This notebook reports the analysis of snvs mutational signatures at
constitutive origins.
Cancer genome
clustering by mutational signatures
Clustering based on
cosine similarities
Cancer genomes are clustered based on the mutational profiles at
origins. To do so, we use the background adjusted mutational signatures
(corrected for both trinucleotide composition and rates observed at
flanking domains).
# r
library(tidyverse)
library(dendextend)
library(tibble)
library(gplots)
library(rlist)
library(MutationalPatterns)
setwd("/Users/pm23/Desktop/Projects/OriCan")
# Load mutational differential matrix
snvs.diff.mut.mat <- readRDS("./002_Mut_signatures_Pan_cancer/rds/snvs.diff.mut.mat.rds")
# Compute cosine similarity matrix
snvs.cos.sim.mat <- cos_sim_matrix(snvs.diff.mut.mat, snvs.diff.mut.mat)
# cluster samples based on euclidean distance between relative contribution
snvs.cos.sim.hc <- hclust(dist(snvs.cos.sim.mat), method = "complete")
# order samples according to clustering
row.order <- rownames(snvs.cos.sim.mat)[snvs.cos.sim.hc$order]
# Generate dendrogram
snvs.cos.sim.dend <- as.dendrogram(snvs.cos.sim.hc)
# Define and identify number of relevant clusters
# Visual inspection reveals 5 clusters
nleaves(snvs.cos.sim.dend) # 227
## [1] 227
nnodes(snvs.cos.sim.dend) # 453
## [1] 453
plot(color_branches(snvs.cos.sim.dend, k=5, groupLabels=T),leaflab="none")

# Visual inspection reveals 5 clusters
# Define and cut clusters
mut.sig.clusters <- cutree(snvs.cos.sim.dend, k=5, order_clusters_as_data = FALSE)
table(mut.sig.clusters)
## mut.sig.clusters
## 1 2 3 4 5
## 42 42 29 23 91
# Generate df reporting the information about genomes and their cluster assignments
clusters.df <- data.frame(donor.id = names(mut.sig.clusters), cluster = mut.sig.clusters)
saveRDS(clusters.df, file = "./002_Mut_signatures_Pan_cancer/rds/clusters.df.rds")
# Cluster columns
hc.sample <- snvs.cos.sim.mat %>% t() %>% dist() %>% hclust(method = "complete")
# Define column and row order
col.order <- colnames(snvs.cos.sim.mat)[hc.sample$order]
row.order <- rownames(snvs.cos.sim.mat)[hc.sample$order]
# Make matrix long and set factor levels, to get the correct order for plotting.
snvs.cos.sim.mat.plot <- snvs.cos.sim.mat %>% as.data.frame() %>% tibble::rownames_to_column("Sample") %>%
pivot_longer(-Sample, names_to = "Signature", values_to = "Cosine.sim") %>%
mutate(Signature = factor(Signature, levels = col.order), Sample = factor(Sample, levels = row.order))
# Plot heatmap
snvs.cos.sim.heatmap <- snvs.cos.sim.mat.plot %>% ggplot(aes(x = Signature, y = Sample, fill = Cosine.sim, order = Sample)) +
geom_raster() +
scale_fill_distiller(palette = "YlOrBr", direction = 1, name = "Cosine \nsimilarity", limits = c(0, 1.000000001)) +
theme_bw() +
theme(aspect.ratio=1, axis.ticks = element_blank(), axis.text.x=element_blank(), axis.text.y=element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank()) + labs(x = NULL, y = NULL)
snvs.cos.sim.heatmap

pdf("./Rplots/snvs.cos.sim.dend.pdf", width=10, height=6, useDingbats=FALSE)
plot(color_branches(snvs.cos.sim.dend, k=5, groupLabels=T),leaflab="none")
dev.off()
## quartz_off_screen
## 2
pdf("./Rplots/snvs.cos.sim.heatmap.pdf", width=10, height=6, useDingbats=FALSE)
snvs.cos.sim.heatmap
dev.off()
## quartz_off_screen
## 2
WARNING: The heatmap is horizontally flipped in the final figure
Mutational cluster
analysis
Primary sites
Assess primary sites distribution within each cluster
# r
library(dplyr)
library(ggplot2)
library(ggpubr)
setwd("/Users/pm23/Desktop/Projects/OriCan")
# Load cluster information
clusters.df <- readRDS("./002_Mut_signatures_Pan_cancer/rds/clusters.df.rds")
table(clusters.df$cluster)
##
## 1 2 3 4 5
## 42 42 29 23 91
# Open manifest
SSM.manifest.df <- read.table(gzfile("./Dataset/ICGC/SSM/manifest.1698683356752.tar.gz"), skip = 1)
SSM.manifest.df <- SSM.manifest.df %>% dplyr::select(V9, V10) %>% separate(V10, c("cancer.type", "project"), sep = "-")
colnames(SSM.manifest.df) <- c("donor.id", "cancer.type", "project")
# Load cancer project information
cancer.project.info.df <- read.csv("./Dataset/ICGC/projects_2023_10_30_04_53_06.tsv", sep = "\t") %>% separate(Project.Code, c("cancer.type", "project"), sep = "-") %>% dplyr::select(cancer.type, Primary.Site)
# Add cancer/primary site information to clusters
clusters.cancer.df <- clusters.df %>% left_join(SSM.manifest.df, by = "donor.id") %>% left_join(cancer.project.info.df, by = "cancer.type") %>% distinct() %>% mutate(cancer.info = paste(cancer.type, Primary.Site, sep = " | "))
saveRDS(clusters.cancer.df, file = "./002_Mut_signatures_Pan_cancer/rds/clusters.cancer.df.rds")
# Assess primary sites distribution within each cluster
cluster.1.cancer.count <- clusters.cancer.df %>% filter(cluster == 1) %>% group_by(cancer.info) %>% summarise(count=n())
cluster.2.cancer.count <- clusters.cancer.df %>% filter(cluster == 2) %>% group_by(cancer.info) %>% summarise(count=n())
cluster.3.cancer.count <- clusters.cancer.df %>% filter(cluster == 3) %>% group_by(cancer.info) %>% summarise(count=n())
cluster.4.cancer.count <- clusters.cancer.df %>% filter(cluster == 4) %>% group_by(cancer.info) %>% summarise(count=n())
cluster.5.cancer.count <- clusters.cancer.df %>% filter(cluster == 5) %>% group_by(cancer.info) %>% summarise(count=n())
# Prepare donut charts
# Color pallette is defined in 003_Transcriptomics_cluster_analysis
hsize <- 2
#
cluster.1.cancer.donut <- cluster.1.cancer.count %>% mutate(x = hsize) %>%
ggplot(aes(x = hsize, y = count, fill = cancer.info)) +
geom_col(color = "black", alpha = 0.6) +
geom_label(aes(label = cancer.info), position = position_stack(vjust = 0.5), show.legend = FALSE) +
coord_polar(theta = "y") + xlim(c(0.2, hsize + 0.5)) +
ggtitle("cluster 1") +
scale_fill_manual(values = c("#F9B233")) +
theme(panel.background = element_rect(fill = "white"), panel.grid = element_blank(), axis.title = element_blank(), axis.ticks = element_blank(), axis.text = element_blank(), legend.position = "none")
cluster.1.cancer.donut

#
cluster.2.cancer.donut <- cluster.2.cancer.count %>% mutate(x = hsize) %>%
ggplot(aes(x = hsize, y = count, fill = cancer.info)) +
geom_col(color = "black", alpha = 0.6) +
geom_label(aes(label = cancer.info), position = position_stack(vjust = 0.5), show.legend = FALSE) +
coord_polar(theta = "y") + xlim(c(0.2, hsize + 0.5)) +
ggtitle("cluster 2") +
scale_fill_manual(values = c("#A084BC", "#9D9D9C", "#BE1622", "#A1D1B1", "#F9B233", "#662483", "#006633", "#29235C")) +
theme(panel.background = element_rect(fill = "white"), panel.grid = element_blank(), axis.title = element_blank(), axis.ticks = element_blank(), axis.text = element_blank(), legend.position = "none")
cluster.2.cancer.donut

#
cluster.3.cancer.donut <- cluster.3.cancer.count %>% mutate(x = hsize) %>%
ggplot(aes(x = hsize, y = count, fill = cancer.info)) +
geom_col(color = "black", alpha = 0.6) +
geom_label(aes(label = cancer.info), position = position_stack(vjust = 0.5), show.legend = FALSE) +
coord_polar(theta = "y") + xlim(c(0.2, hsize + 0.5)) +
ggtitle("cluster 3") +
scale_fill_manual(values = c("#9D9D9C", "#A1D1B1", "#EA83B3", "#1D71B8", "#E94E1B", "#662483", "#006633", "#29235C", "#DEDC00", "#B17F4A")) +
theme(panel.background = element_rect(fill = "white"), panel.grid = element_blank(), axis.title = element_blank(), axis.ticks = element_blank(), axis.text = element_blank(), legend.position = "none")
cluster.3.cancer.donut

#
cluster.4.cancer.donut <- cluster.4.cancer.count %>% mutate(x = hsize) %>%
ggplot(aes(x = hsize, y = count, fill = cancer.info)) +
geom_col(color = "black", alpha = 0.6) +
geom_label(aes(label = cancer.info), position = position_stack(vjust = 0.5), show.legend = FALSE) +
coord_polar(theta = "y") + xlim(c(0.2, hsize + 0.5)) +
ggtitle("cluster 4") +
scale_fill_manual(values = c("#9B7277")) +
theme(panel.background = element_rect(fill = "white"), panel.grid = element_blank(), axis.title = element_blank(), axis.ticks = element_blank(), axis.text = element_blank(), legend.position = "none")
cluster.4.cancer.donut

#
cluster.5.cancer.donut <- cluster.5.cancer.count %>% mutate(x = hsize) %>%
ggplot(aes(x = hsize, y = count, fill = cancer.info)) +
geom_col(color = "black", alpha = 0.6) +
geom_label(aes(label = cancer.info), position = position_stack(vjust = 0.5), show.legend = FALSE) +
coord_polar(theta = "y") + xlim(c(0.2, hsize + 0.5)) +
ggtitle("cluster 5") +
scale_fill_manual(values = c("#9D9D9C", "#683C11", "#A1D1B1", "#EA83B3", "#073F60", "#1D71B8", "#BE1622", "#F9B233", "#E94E1B", "#662483", "#006633", "#DEDC00", "#B17F4A")) +
theme(panel.background = element_rect(fill = "white"), panel.grid = element_blank(), axis.title = element_blank(), axis.ticks = element_blank(), axis.text = element_blank(), legend.position = "none")
cluster.5.cancer.donut

pdf("./Rplots/snvs.sig.cluster.donuts.pdf", width=10, height=6, useDingbats=FALSE)
ggarrange(cluster.1.cancer.donut, cluster.2.cancer.donut, cluster.3.cancer.donut, cluster.4.cancer.donut, cluster.5.cancer.donut, nrow = 2, ncol = 3)
dev.off()
## quartz_off_screen
## 2
Aggregated
mutational signatures
Compute aggregated mutational signatures for each clusters
# r
library(dplyr)
library(MutationalPatterns)
setwd("/Users/pm23/Desktop/Projects/OriCan")
# Load cluster information
clusters.df <- readRDS("./002_Mut_signatures_Pan_cancer/rds/clusters.df.rds")
# Load mutational matrix
snvs.diff.mut.mat <- readRDS("./002_Mut_signatures_Pan_cancer/rds/snvs.diff.mut.mat.rds")
# Sum mutation counts per clusters
cluster.1.id <- (clusters.df %>% filter(cluster == 1))$donor.id
snvs.ori.mut.cluster.1 <- as.data.frame(snvs.diff.mut.mat) %>% dplyr::select(all_of(cluster.1.id)) %>% mutate(cluster.1 = rowSums(.)) %>% dplyr::select(cluster.1)
cluster.2.id <- (clusters.df %>% filter(cluster == 2))$donor.id
snvs.ori.mut.cluster.2 <- as.data.frame(snvs.diff.mut.mat) %>% dplyr::select(all_of(cluster.2.id)) %>% mutate(cluster.2 = rowSums(.)) %>% dplyr::select(cluster.2)
cluster.3.id <- (clusters.df %>% filter(cluster == 3))$donor.id
snvs.ori.mut.cluster.3 <- as.data.frame(snvs.diff.mut.mat) %>% dplyr::select(all_of(cluster.3.id)) %>% mutate(cluster.3 = rowSums(.)) %>% dplyr::select(cluster.3)
cluster.4.id <- (clusters.df %>% filter(cluster == 4))$donor.id
snvs.ori.mut.cluster.4 <- as.data.frame(snvs.diff.mut.mat) %>% dplyr::select(all_of(cluster.4.id)) %>% mutate(cluster.4 = rowSums(.)) %>% dplyr::select(cluster.4)
cluster.5.id <- (clusters.df %>% filter(cluster == 5))$donor.id
snvs.ori.mut.cluster.5 <- as.data.frame(snvs.diff.mut.mat) %>% dplyr::select(all_of(cluster.5.id)) %>% mutate(cluster.5 = rowSums(.)) %>% dplyr::select(cluster.5)
# Combine information
snvs.ori.mut.cluster <- cbind.data.frame(snvs.ori.mut.cluster.1, snvs.ori.mut.cluster.2, snvs.ori.mut.cluster.3,
snvs.ori.mut.cluster.4, snvs.ori.mut.cluster.5)
snvs.ori.mut.cluster <- as.matrix(snvs.ori.mut.cluster)
saveRDS(snvs.ori.mut.cluster, "./002_Mut_signatures_Pan_cancer/rds/snvs.ori.mut.cluster.signatures.rds")
# Plot profiles
plot_96_profile(snvs.ori.mut.cluster, condensed = TRUE, ymax = 0.24)

pdf("./Rplots/snvs.ori.mut.cluster.pdf", width=5, height=6, useDingbats=FALSE)
plot_96_profile(snvs.ori.mut.cluster, condensed = TRUE, ymax = 0.24)
dev.off()
## quartz_off_screen
## 2
Assess the distribution of number of mutations within clusters
# r
library(dplyr)
library(MutationalPatterns)
library(scales)
setwd("/Users/pm23/Desktop/Projects/OriCan")
# Load cluster information
clusters.df <- readRDS("./002_Mut_signatures_Pan_cancer/rds/clusters.df.rds")
# Load snvs count summary
snvs.summary.df <- readRDS("./002_Mut_signatures_Pan_cancer/rds/snvs.summary.df.rds")
# Combine
clusters.snvs.count.df <- clusters.df %>% left_join(snvs.summary.df, by = "donor.id")
# Plot
clusters.snvs.count.plot <- clusters.snvs.count.df %>% filter(snvs.count >= 50) %>%
ggplot(aes(x=as.character(cluster), y=snvs.count, fill="#E41F1A", alpha = 0.3)) +
geom_boxplot(outlier.shape = NA, width = 0.5) +
geom_jitter(color="black", size=0.4, alpha=0.4) + ylab("snvs count at origins") +
scale_y_continuous(trans = log10_trans(), limits=c(50,1500)) +
theme_bw() + theme(aspect.ratio=2, panel.grid.minor = element_line(linetype = "dotted"), legend.position = "none", axis.title.x=element_blank(), axis.text.x=element_text(angle=45,hjust=1))
clusters.snvs.count.plot

pdf("./Rplots/clusters.snvs.count.plot.pdf", width=7, height=6, useDingbats=FALSE)
clusters.snvs.count.plot
dev.off()
## quartz_off_screen
## 2
Assess the distribution of the ratio of snvs at origins vs flanks for
each cluster
# r
library(dplyr)
setwd("/Users/pm23/Desktop/Projects/OriCan")
# Load cluster information
clusters.df <- readRDS("./002_Mut_signatures_Pan_cancer/rds/clusters.df.rds")
# Load snvs density ratio df
snvs.dens.ratio.df <- readRDS("./001_Mut_density_Pan_cancer/rds/snvs.dens.ratio.df.rds")
colnames(snvs.dens.ratio.df)[1] <- "donor.id"
# Combine information
clusters.density.df <- clusters.df %>% left_join(snvs.dens.ratio.df, by = "donor.id")
# Plot
clusters.density.plot <- clusters.density.df %>%
ggplot(aes(x=as.character(cluster), y=snvs.dens.ratio, fill="#E41F1A", alpha = 0.3)) +
geom_boxplot(outlier.shape = NA, width = 0.5) +
geom_jitter(color="black", size=0.4, alpha=0.4) + ylab("Mutation density ratio (ori/flanks)") +
geom_hline(yintercept=1, linetype="dashed") +
theme_bw() + theme(aspect.ratio=2, panel.grid.minor = element_line(linetype = "dotted"), legend.position = "none", axis.title.x=element_blank(), axis.text.x=element_text(angle=45,hjust=1))
clusters.density.plot

pdf("./Rplots/clusters.density.plot.pdf", width=7, height=6, useDingbats=FALSE)
clusters.density.plot
dev.off()
## quartz_off_screen
## 2
Cluster 5
subclusters
Characterise cluster 5 subclusters
# R
library(tidyverse)
library(dendextend)
library(tibble)
library(gplots)
library(rlist)
library(MutationalPatterns)
setwd("~/Desktop/Projects/OriCan")
# Load mutational differential matrix
snvs.diff.mut.mat <- readRDS("~/Desktop/Projects/OriCan/002_Mut_signatures_Pan_cancer/rds/snvs.diff.mut.mat.rds")
# Compute cosine similarity matrix
snvs.cos.sim.mat <- cos_sim_matrix(snvs.diff.mut.mat, snvs.diff.mut.mat)
# cluster samples based on euclidean distance between relative contribution
snvs.cos.sim.hc <- hclust(dist(snvs.cos.sim.mat), method = "complete")
# order samples according to clustering
row.order <- rownames(snvs.cos.sim.mat)[snvs.cos.sim.hc$order]
# Generate dendrogram
snvs.cos.sim.dend <- as.dendrogram(snvs.cos.sim.hc)
# Plot original clusters
plot(color_branches(snvs.cos.sim.dend, k=5, groupLabels=T),leaflab="none")
# Split cluster 5
plot(color_branches(snvs.cos.sim.dend, k=11, groupLabels=T),leaflab="none")
# Define subclusters
# cluster 5-1, join cluster 7 and 8
# cluster 5-2, cluster 9
# cluster 5-3, cluster 10
# cluster 5-4, cluster 11
# Define and cut clusters
mut.sig.sub.clusters <- cutree(snvs.cos.sim.dend, k=11, order_clusters_as_data = FALSE)
table(mut.sig.sub.clusters)
# Generate df reporting the information about genomes and their cluster assignments
sub.clusters.df <- data.frame(donor.id = names(mut.sig.sub.clusters), cluster = mut.sig.sub.clusters)
# Define cluster 5 sub-clusters
cluster.5.1.id <- (sub.clusters.df %>% filter(cluster == "7" | cluster == "8"))$donor.id
cluster.5.2.id <- (sub.clusters.df %>% filter(cluster == "9"))$donor.id
cluster.5.3.id <- (sub.clusters.df %>% filter(cluster == "10"))$donor.id
cluster.5.4.id <- (sub.clusters.df %>% filter(cluster == "11"))$donor.id
# Compute mutational signatures for sub-clusters
cluster.5.1.mut <- as.data.frame(snvs.diff.mut.mat) %>% dplyr::select(all_of(cluster.5.1.id)) %>% mutate(cluster.5.1 = rowSums(.)) %>% dplyr::select(cluster.5.1)
cluster.5.2.mut <- as.data.frame(snvs.diff.mut.mat) %>% dplyr::select(all_of(cluster.5.2.id)) %>% mutate(cluster.5.2 = rowSums(.)) %>% dplyr::select(cluster.5.2)
cluster.5.3.mut <- as.data.frame(snvs.diff.mut.mat) %>% dplyr::select(all_of(cluster.5.3.id)) %>% mutate(cluster.5.3 = rowSums(.)) %>% dplyr::select(cluster.5.3)
cluster.5.4.mut <- as.data.frame(snvs.diff.mut.mat) %>% dplyr::select(all_of(cluster.5.4.id)) %>% mutate(cluster.5.4 = rowSums(.)) %>% dplyr::select(cluster.5.4)
# Combine signatures
cluster.5.sub.clusters.mut <- cbind.data.frame(cluster.5.1.mut, cluster.5.2.mut, cluster.5.3.mut, cluster.5.4.mut)
saveRDS(cluster.5.sub.clusters.mut, "~/Desktop/Projects/OriCan/002_Mut_signatures_Pan_cancer/rds/cluster.5.sub.clusters.mut.rds")
Plot signatures
# r
library(dplyr)
library(MutationalPatterns)
# Load cluster information
cluster.5.sub.clusters.mut <- readRDS("~/Desktop/Projects/OriCan/002_Mut_signatures_Pan_cancer/rds/cluster.5.sub.clusters.mut.rds")
plot_96_profile(cluster.5.sub.clusters.mut, condensed = TRUE, ymax = 0.45)

pdf("~/Desktop/Projects/OriCan/Manuscript/Nature Communications/Rplots/cluster.5.sub.clusters.signatures.pdf", width=5, height=4, useDingbats=FALSE)
plot_96_profile(cluster.5.sub.clusters.mut, condensed = TRUE, ymax = 0.45)
dev.off()
## quartz_off_screen
## 2
Origin-associated
signature exposure
Compute mutation
count matrices
Mutations from genomes of each cluster are combined and spatially
resolved in bins of 100 nt around origins. The extracted signatures are
then fitted to the mutation count matrices in order to characterise
their spatial contribution.
Combine vcf files
# Cluster 1 - skin melanomas
# Recover donor id, file names and paths
cluster.1.donor.id <- (clusters.cancer.df %>% filter(cluster == 1))$donor.id
cluster.1.vcf.name <- (snvs.manifest.df %>% filter(donor.id %in% cluster.1.donor.id))$file.name # 42 genomes
cluster.1.vcf.path <- paste("./Dataset/ICGC/SSM/VCF/", cluster.1.vcf.name, sep = "")
# Combine vcf files
cluster.1.snvs.df <- tibble()
for (i in 1:length(cluster.1.vcf.path)) {
print(i/length(cluster.1.vcf.path))
vcf.i <- read.table(gzfile(cluster.1.vcf.path[i]), comment.char = "#") %>% dplyr::select(V1, V2, V4, V5) %>% mutate(end = as.integer(V2 + 1)) %>% dplyr::select(seqnames = V1, start = V2, end, REF = V4, ALT = V5)
cluster.1.snvs.df <- rbind(cluster.1.snvs.df, vcf.i)
}
write.table(cluster.1.snvs.df, "./002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.1.snvs.bed", sep="\t", col.names = F, row.names = F, quote = F)
# Cluster 2 - diverse genomes
# Recover donor id, file names and paths
cluster.2.donor.id <- (clusters.cancer.df %>% filter(cluster == 2))$donor.id
cluster.2.vcf.name <- (snvs.manifest.df %>% filter(donor.id %in% cluster.2.donor.id))$file.name # 46 genomes
cluster.2.vcf.path <- paste("./Dataset/ICGC/SSM/VCF/", cluster.2.vcf.name, sep = "")
# Combine vcf files
cluster.2.snvs.df <- tibble()
for (i in 1:length(cluster.2.vcf.path)) {
print(i/length(cluster.2.vcf.path))
vcf.i <- read.table(gzfile(cluster.2.vcf.path[i]), comment.char = "#") %>% dplyr::select(V1, V2, V4, V5) %>% mutate(end = as.integer(V2 + 1)) %>% dplyr::select(seqnames = V1, start = V2, end, REF = V4, ALT = V5)
cluster.2.snvs.df <- rbind(cluster.2.snvs.df, vcf.i)
}
write.table(cluster.2.snvs.df, "./002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.2.snvs.bed", sep="\t", col.names = F, row.names = F, quote = F)
# Cluster 3 - diverse genomes
# Recover donor id, file names and paths
cluster.3.donor.id <- (clusters.cancer.df %>% filter(cluster == 3))$donor.id
cluster.3.vcf.name <- (snvs.manifest.df %>% filter(donor.id %in% cluster.3.donor.id))$file.name # 29 genomes
cluster.3.vcf.path <- paste("./Dataset/ICGC/SSM/VCF/", cluster.3.vcf.name, sep = "")
# Combine vcf files
cluster.3.snvs.df <- tibble()
for (i in 1:length(cluster.3.vcf.path)) {
print(i/length(cluster.3.vcf.path))
vcf.i <- read.table(gzfile(cluster.3.vcf.path[i]), comment.char = "#") %>% dplyr::select(V1, V2, V4, V5) %>% mutate(end = as.integer(V2 + 1)) %>% dplyr::select(seqnames = V1, start = V2, end, REF = V4, ALT = V5)
cluster.3.snvs.df <- rbind(cluster.3.snvs.df, vcf.i)
}
write.table(cluster.3.snvs.df, "./002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.3.snvs.bed", sep="\t", col.names = F, row.names = F, quote = F)
# Cluster 4 - chronic myeloid disorders
# Recover donor id, file names and paths
cluster.4.donor.id <- (clusters.cancer.df %>% filter(cluster == 4))$donor.id
cluster.4.vcf.name <- (snvs.manifest.df %>% filter(donor.id %in% cluster.4.donor.id))$file.name # 23 genomes
cluster.4.vcf.path <- paste("./Dataset/ICGC/SSM/VCF/", cluster.4.vcf.name, sep = "")
# Combine vcf files
cluster.4.snvs.df <- tibble()
for (i in 1:length(cluster.4.vcf.path)) {
print(i/length(cluster.4.vcf.path))
vcf.i <- read.table(gzfile(cluster.4.vcf.path[i]), comment.char = "#") %>% dplyr::select(V1, V2, V4, V5) %>% mutate(end = as.integer(V2 + 1)) %>% dplyr::select(seqnames = V1, start = V2, end, REF = V4, ALT = V5)
cluster.4.snvs.df <- rbind(cluster.4.snvs.df, vcf.i)
}
write.table(cluster.4.snvs.df, "./002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.4.snvs.bed", sep="\t", col.names = F, row.names = F, quote = F)
# Cluster 5 - diverse genomes
# Recover donor id, file names and paths
cluster.5.donor.id <- (clusters.cancer.df %>% filter(cluster == 5))$donor.id
cluster.5.vcf.name <- (snvs.manifest.df %>% filter(donor.id %in% cluster.5.donor.id))$file.name # 93 genomes
cluster.5.vcf.path <- paste("./Dataset/ICGC/SSM/VCF/", cluster.5.vcf.name, sep = "")
# Combine vcf files
cluster.5.snvs.df <- tibble()
for (i in 1:length(cluster.5.vcf.path)) {
print(i/length(cluster.5.vcf.path))
vcf.i <- read.table(gzfile(cluster.5.vcf.path[i]), comment.char = "#") %>% dplyr::select(V1, V2, V4, V5) %>% mutate(end = as.integer(V2 + 1)) %>% dplyr::select(seqnames = V1, start = V2, end, REF = V4, ALT = V5)
cluster.5.snvs.df <- rbind(cluster.5.snvs.df, vcf.i)
}
write.table(cluster.5.snvs.df, "./002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.5.snvs.bed", sep="\t", col.names = F, row.names = F, quote = F)
Compute snvs-origin distances and filter mutations at less than 10kb
of an origin
# bash/SLURM
# Cluster 1 - skin melanomas
sort -k1,1 -k2,2n /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.1.snvs.bed \
> /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.1.snvs.sorted.bed
bedtools closest -a /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.1.snvs.sorted.bed \
-b /cephfs2/pmurat/OriCan/Dataset/ori/ori.inter.hg19.NCBI.bed -D ref \
> /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.1.snvs.closest.bed
awk '$12>=-10000 && $12<=10000' /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.1.snvs.closest.bed \
> /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.1.snvs.10kb.bed
rm -f /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.1.snvs.bed
rm -f /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.1.snvs.sorted.bed
rm -f /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.1.snvs.closest.bed
# Cluster 2 - diverse genomes
sort -k1,1 -k2,2n /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.2.snvs.bed \
> /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.2.snvs.sorted.bed
bedtools closest -a /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.2.snvs.sorted.bed \
-b /cephfs2/pmurat/OriCan/Dataset/ori/ori.inter.hg19.NCBI.bed -D ref \
> /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.2.snvs.closest.bed
awk '$12>=-10000 && $12<=10000' /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.2.snvs.closest.bed \
> /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.2.snvs.10kb.bed
rm -f /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.2.snvs.bed
rm -f /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.2.snvs.sorted.bed
rm -f /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.2.snvs.closest.bed
# Cluster 3 - diverse genomes
sort -k1,1 -k2,2n /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.3.snvs.bed \
> /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.3.snvs.sorted.bed
bedtools closest -a /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.3.snvs.sorted.bed \
-b /cephfs2/pmurat/OriCan/Dataset/ori/ori.inter.hg19.NCBI.bed -D ref \
> /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.3.snvs.closest.bed
awk '$12>=-10000 && $12<=10000' /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.3.snvs.closest.bed \
> /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.3.snvs.10kb.bed
rm -f /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.3.snvs.bed
rm -f /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.3.snvs.sorted.bed
rm -f /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.3.snvs.closest.bed
# Cluster 4 - chronic myeloid disorders
sort -k1,1 -k2,2n /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.4.snvs.bed \
> /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.4.snvs.sorted.bed
bedtools closest -a /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.4.snvs.sorted.bed \
-b /cephfs2/pmurat/OriCan/Dataset/ori/ori.inter.hg19.NCBI.bed -D ref \
> /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.4.snvs.closest.bed
awk '$12>=-10000 && $12<=10000' /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.4.snvs.closest.bed \
> /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.4.snvs.10kb.bed
rm -f /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.4.snvs.bed
rm -f /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.4.snvs.sorted.bed
rm -f /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.4.snvs.closest.bed
# Cluster 5 - diverse genomes
sort -k1,1 -k2,2n /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.5.snvs.bed \
> /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.5.snvs.sorted.bed
bedtools closest -a /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.5.snvs.sorted.bed \
-b /cephfs2/pmurat/OriCan/Dataset/ori/ori.inter.hg19.NCBI.bed -D ref \
> /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.5.snvs.closest.bed
awk '$12>=-10000 && $12<=10000' /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.5.snvs.closest.bed \
> /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.5.snvs.10kb.bed
rm -f /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.5.snvs.bed
rm -f /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.5.snvs.sorted.bed
rm -f /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.5.snvs.closest.bed
Split bed files according to origin distances
# r
# Cluster 1 - skin melanomas
cluster.1.snvs.10kb.df <- read.table("./002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.1.snvs.10kb.bed", sep = "\t")
cluster.1.snvs.10kb.df <- cluster.1.snvs.10kb.df %>% mutate(dist = (as.numeric(cut(-V12, breaks = seq(-10000, 10000, 100)))-100)*100) %>% drop_na()
for (i in unique(cluster.1.snvs.10kb.df$dist)) {
cluster.1.vcf <- cluster.1.snvs.10kb.df %>% filter(dist == i) %>%
mutate(ID = paste("rs", V2, sep = ""), QUAL = ".", FILTER = ".", INFO = ".", FORMAT = ".") %>%
dplyr::select(V1, V2, ID, V4, V5, QUAL, FILTER, INFO, FORMAT)
colnames(cluster.1.vcf) <- c('#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT')
vcffile <- paste0("./002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster_1/Bin_",i,".bed")
write.table(cluster.1.vcf, vcffile, sep="\t", col.names = T, row.names = F, quote = F)
}
# Cluster 2 - diverse genomes
cluster.2.snvs.10kb.df <- read.table("./002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.2.snvs.10kb.bed", sep = "\t")
cluster.2.snvs.10kb.df <- cluster.2.snvs.10kb.df %>% mutate(dist = (as.numeric(cut(-V12, breaks = seq(-10000, 10000, 100)))-100)*100) %>% drop_na()
for (i in unique(cluster.2.snvs.10kb.df$dist)) {
cluster.2.vcf <- cluster.2.snvs.10kb.df %>% filter(dist == i) %>%
mutate(ID = paste("rs", V2, sep = ""), QUAL = ".", FILTER = ".", INFO = ".", FORMAT = ".") %>%
dplyr::select(V1, V2, ID, V4, V5, QUAL, FILTER, INFO, FORMAT)
colnames(cluster.2.vcf) <- c('#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT')
vcffile <- paste0("./002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster_2/Bin_",i,".bed")
write.table(cluster.2.vcf, vcffile, sep="\t", col.names = T, row.names = F, quote = F)
}
# Cluster 3 - diverse genomes
cluster.3.snvs.10kb.df <- read.table("./002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.3.snvs.10kb.bed", sep = "\t")
cluster.3.snvs.10kb.df <- cluster.3.snvs.10kb.df %>% mutate(dist = (as.numeric(cut(-V12, breaks = seq(-10000, 10000, 100)))-100)*100) %>% drop_na()
for (i in unique(cluster.3.snvs.10kb.df$dist)) {
cluster.3.vcf <- cluster.3.snvs.10kb.df %>% filter(dist == i) %>%
mutate(ID = paste("rs", V2, sep = ""), QUAL = ".", FILTER = ".", INFO = ".", FORMAT = ".") %>%
dplyr::select(V1, V2, ID, V4, V5, QUAL, FILTER, INFO, FORMAT)
colnames(cluster.3.vcf) <- c('#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT')
vcffile <- paste0("./002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster_3/Bin_",i,".bed")
write.table(cluster.3.vcf, vcffile, sep="\t", col.names = T, row.names = F, quote = F)
}
# Cluster 4 - chronic myeloid disorders
cluster.4.snvs.10kb.df <- read.table("./002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.4.snvs.10kb.bed", sep = "\t")
cluster.4.snvs.10kb.df <- cluster.4.snvs.10kb.df %>% mutate(dist = (as.numeric(cut(-V12, breaks = seq(-10000, 10000, 100)))-100)*100) %>% drop_na()
for (i in unique(cluster.4.snvs.10kb.df$dist)) {
cluster.4.vcf <- cluster.4.snvs.10kb.df %>% filter(dist == i) %>%
mutate(ID = paste("rs", V2, sep = ""), QUAL = ".", FILTER = ".", INFO = ".", FORMAT = ".") %>%
dplyr::select(V1, V2, ID, V4, V5, QUAL, FILTER, INFO, FORMAT)
colnames(cluster.4.vcf) <- c('#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT')
vcffile <- paste0("./002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster_4/Bin_",i,".bed")
write.table(cluster.4.vcf, vcffile, sep="\t", col.names = T, row.names = F, quote = F)
}
# Cluster 5 - diverse genomes
cluster.5.snvs.10kb.df <- read.table("./002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.5.snvs.10kb.bed", sep = "\t")
cluster.5.snvs.10kb.df <- cluster.5.snvs.10kb.df %>% mutate(dist = (as.numeric(cut(-V12, breaks = seq(-10000, 10000, 100)))-100)*100) %>% drop_na()
for (i in unique(cluster.5.snvs.10kb.df$dist)) {
cluster.5.vcf <- cluster.5.snvs.10kb.df %>% filter(dist == i) %>%
mutate(ID = paste("rs", V2, sep = ""), QUAL = ".", FILTER = ".", INFO = ".", FORMAT = ".") %>%
dplyr::select(V1, V2, ID, V4, V5, QUAL, FILTER, INFO, FORMAT)
colnames(cluster.5.vcf) <- c('#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT')
vcffile <- paste0("./002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster_5/Bin_",i,".bed")
write.table(cluster.5.vcf, vcffile, sep="\t", col.names = T, row.names = F, quote = F)
}
Convert to vcf format
# bash/SLURM
# Cluster 1 - skin melanomas
for filename in /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster_1/*.bed
do
path=${filename%.*}
file=${path##*/}
echo $file
awk 'BEGIN{print"##fileformat=VCFv4.1"}1' /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster_1/$file.bed > /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/VCF/cluster_1/$file.vcf
done
rm -rf /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster_1
# Cluster 2 - diverse genomes
for filename in /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster_2/*.bed
do
path=${filename%.*}
file=${path##*/}
echo $file
awk 'BEGIN{print"##fileformat=VCFv4.1"}1' /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster_2/$file.bed > /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/VCF/cluster_2/$file.vcf
done
rm -rf /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster_2
# Cluster 3 - diverse genomes
for filename in /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster_3/*.bed
do
path=${filename%.*}
file=${path##*/}
echo $file
awk 'BEGIN{print"##fileformat=VCFv4.1"}1' /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster_3/$file.bed > /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/VCF/cluster_3/$file.vcf
done
rm -rf /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster_3
# Cluster 4 - chronic myeloid disorders
for filename in /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster_4/*.bed
do
path=${filename%.*}
file=${path##*/}
echo $file
awk 'BEGIN{print"##fileformat=VCFv4.1"}1' /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster_4/$file.bed > /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/VCF/cluster_4/$file.vcf
done
rm -rf /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster_4
# Cluster 5 - diverse genomes
for filename in /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster_5/*.bed
do
path=${filename%.*}
file=${path##*/}
echo $file
awk 'BEGIN{print"##fileformat=VCFv4.1"}1' /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster_5/$file.bed > /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/VCF/cluster_5/$file.vcf
done
rm -rf /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster_5
Extract mutational count matrices
# r
library(dplyr)
library(MutationalPatterns)
setwd("/Users/pm23/Desktop/Projects/OriCan")
# Load genome
ref.genome <- BSgenome.Hsapiens.UCSC.hg19
# Cluster 1 - skin melanomas
vcfFiles.cluster.1 <- grep("*.vcf$", dir("./002_Mut_signatures_Pan_cancer/sig/cluster/VCF/cluster_1/"), value = TRUE)
cluster.1.files <- paste0("./002_Mut_signatures_Pan_cancer/sig/cluster/VCF/cluster_1/", vcfFiles.cluster.1)
cluster.1.sample.names <- vcfFiles.cluster.1 %>% str_replace(".vcf", "") %>% str_replace("Bin_", "")
cluster.1.snvs.grl <- read_vcfs_as_granges(cluster.1.files, cluster.1.sample.names, ref.genome)
saveRDS(cluster.1.snvs.grl, file = "./002_Mut_signatures_Pan_cancer/sig/cluster.1.snvs.grl.rds")
# Cluster 2 - diverse genomes
vcfFiles.cluster.2 <- grep("*.vcf$", dir("./002_Mut_signatures_Pan_cancer/sig/cluster/VCF/cluster_2/"), value = TRUE)
cluster.2.files <- paste0("./002_Mut_signatures_Pan_cancer/sig/cluster/VCF/cluster_2/", vcfFiles.cluster.2)
cluster.2.sample.names <- vcfFiles.cluster.2 %>% str_replace(".vcf", "") %>% str_replace("Bin_", "")
cluster.2.snvs.grl <- read_vcfs_as_granges(cluster.2.files, cluster.2.sample.names, ref.genome)
saveRDS(cluster.2.snvs.grl, file = "./002_Mut_signatures_Pan_cancer/sig/cluster.2.snvs.grl.rds")
# Cluster 3 - diverse genomes
vcfFiles.cluster.3 <- grep("*.vcf$", dir("./002_Mut_signatures_Pan_cancer/sig/cluster/VCF/cluster_3/"), value = TRUE)
cluster.3.files <- paste0("./002_Mut_signatures_Pan_cancer/sig/cluster/VCF/cluster_3/", vcfFiles.cluster.3)
cluster.3.sample.names <- vcfFiles.cluster.3 %>% str_replace(".vcf", "") %>% str_replace("Bin_", "")
cluster.3.snvs.grl <- read_vcfs_as_granges(cluster.3.files, cluster.3.sample.names, ref.genome)
saveRDS(cluster.3.snvs.grl, file = "./002_Mut_signatures_Pan_cancer/sig/cluster.3.snvs.grl.rds")
# Cluster 4 - chronic myeloid disorders
vcfFiles.cluster.4 <- grep("*.vcf$", dir("./002_Mut_signatures_Pan_cancer/sig/cluster/VCF/cluster_4/"), value = TRUE)
cluster.4.files <- paste0("./002_Mut_signatures_Pan_cancer/sig/cluster/VCF/cluster_4/", vcfFiles.cluster.4)
cluster.4.sample.names <- vcfFiles.cluster.4 %>% str_replace(".vcf", "") %>% str_replace("Bin_", "")
cluster.4.snvs.grl <- read_vcfs_as_granges(cluster.4.files, cluster.4.sample.names, ref.genome)
saveRDS(cluster.4.snvs.grl, file = "./002_Mut_signatures_Pan_cancer/sig/cluster.4.snvs.grl.rds")
# Cluster 5 - diverse genomes
vcfFiles.cluster.5 <- grep("*.vcf$", dir("./002_Mut_signatures_Pan_cancer/sig/cluster/VCF/cluster_5/"), value = TRUE)
cluster.5.files <- paste0("./002_Mut_signatures_Pan_cancer/sig/cluster/VCF/cluster_5/", vcfFiles.cluster.5)
cluster.5.sample.names <- vcfFiles.cluster.5 %>% str_replace(".vcf", "") %>% str_replace("Bin_", "")
cluster.5.snvs.grl <- read_vcfs_as_granges(cluster.5.files, cluster.5.sample.names, ref.genome)
saveRDS(cluster.5.snvs.grl, file = "./002_Mut_signatures_Pan_cancer/sig/cluster.5.snvs.grl.rds")
# Load grange lists
cluster.1.snvs.grl <- readRDS("./002_Mut_signatures_Pan_cancer/sig/cluster.1.snvs.grl.rds")
cluster.2.snvs.grl <- readRDS("./002_Mut_signatures_Pan_cancer/sig/cluster.2.snvs.grl.rds")
cluster.3.snvs.grl <- readRDS("./002_Mut_signatures_Pan_cancer/sig/cluster.3.snvs.grl.rds")
cluster.4.snvs.grl <- readRDS("./002_Mut_signatures_Pan_cancer/sig/cluster.4.snvs.grl.rds")
cluster.5.snvs.grl <- readRDS("./002_Mut_signatures_Pan_cancer/sig/cluster.5.snvs.grl.rds")
# Compute mutation count matrices
cluster.1.mut.mat <- mut_matrix(vcf_list = cluster.1.snvs.grl, ref_genome = ref.genome)
cluster.2.mut.mat <- mut_matrix(vcf_list = cluster.2.snvs.grl, ref_genome = ref.genome)
cluster.3.mut.mat <- mut_matrix(vcf_list = cluster.3.snvs.grl, ref_genome = ref.genome)
cluster.4.mut.mat <- mut_matrix(vcf_list = cluster.4.snvs.grl, ref_genome = ref.genome)
cluster.5.mut.mat <- mut_matrix(vcf_list = cluster.5.snvs.grl, ref_genome = ref.genome)
# Save
saveRDS(cluster.1.mut.mat, file = "./002_Mut_signatures_Pan_cancer/sig/cluster.1.mut.mat.rds")
saveRDS(cluster.2.mut.mat, file = "./002_Mut_signatures_Pan_cancer/sig/cluster.2.mut.mat.rds")
saveRDS(cluster.3.mut.mat, file = "./002_Mut_signatures_Pan_cancer/sig/cluster.3.mut.mat.rds")
saveRDS(cluster.4.mut.mat, file = "./002_Mut_signatures_Pan_cancer/sig/cluster.4.mut.mat.rds")
saveRDS(cluster.5.mut.mat, file = "./002_Mut_signatures_Pan_cancer/sig/cluster.5.mut.mat.rds")
Local exposure
# r
# Load mutation count matrices
cluster.1.mut.mat <- readRDS(file = "./002_Mut_signatures_Pan_cancer/sig/cluster.1.mut.mat.rds")
cluster.2.mut.mat <- readRDS(file = "./002_Mut_signatures_Pan_cancer/sig/cluster.2.mut.mat.rds")
cluster.3.mut.mat <- readRDS(file = "./002_Mut_signatures_Pan_cancer/sig/cluster.3.mut.mat.rds")
cluster.4.mut.mat <- readRDS(file = "./002_Mut_signatures_Pan_cancer/sig/cluster.4.mut.mat.rds")
cluster.5.mut.mat <- readRDS(file = "./002_Mut_signatures_Pan_cancer/sig/cluster.5.mut.mat.rds")
# Load identified signatures
snvs.ori.mut.cluster <- readRDS("./002_Mut_signatures_Pan_cancer/rds/snvs.ori.mut.cluster.signatures.rds")
# Cluster 1 - skin melanomas
cluster.1.fit <- fit_to_signatures(cluster.1.mut.mat, snvs.ori.mut.cluster)
cluster.1.fit.df <- as.data.frame(cluster.1.fit$contribution)
cluster.1.fit.df <- as.data.frame(t(cluster.1.fit.df)) %>% add_rownames(var = "dist") %>% dplyr::select(dist, SBS.clust.1 = cluster.1) %>% mutate(SBS.clust.1.FC = SBS.clust.1/mean(SBS.clust.1[c(1:10,190:200)]))
cluster.1.fit.df$dist <- as.numeric(cluster.1.fit.df$dist)
cluster.1.norm.fit.df <- cluster.1.fit.df %>% dplyr::select(dist, value = SBS.clust.1.FC) %>% mutate(cluster = "cluster.1")
# Cluster 2 - diverse genomes
cluster.2.fit <- fit_to_signatures(cluster.2.mut.mat, snvs.ori.mut.cluster)
cluster.2.fit.df <- as.data.frame(cluster.2.fit$contribution)
cluster.2.fit.df <- as.data.frame(t(cluster.2.fit.df)) %>% add_rownames(var = "dist") %>% dplyr::select(dist, SBS.clust.2 = cluster.2) %>% mutate(SBS.clust.2.FC = (SBS.clust.2+1)/mean(SBS.clust.2[c(1:10,190:200)]))
cluster.2.fit.df$dist <- as.numeric(cluster.2.fit.df$dist)
cluster.2.norm.fit.df <- cluster.2.fit.df %>% dplyr::select(dist, value = SBS.clust.2.FC) %>% mutate(cluster = "cluster.2")
# Cluster 3 - diverse genomes
cluster.3.fit <- fit_to_signatures(cluster.3.mut.mat, snvs.ori.mut.cluster)
cluster.3.fit.df <- as.data.frame(cluster.3.fit$contribution)
cluster.3.fit.df <- as.data.frame(t(cluster.3.fit.df)) %>% add_rownames(var = "dist") %>% dplyr::select(dist, SBS.clust.3 = cluster.3) %>% mutate(SBS.clust.3.FC = SBS.clust.3/mean(SBS.clust.3[c(1:10,190:200)]))
cluster.3.fit.df$dist <- as.numeric(cluster.3.fit.df$dist)
cluster.3.norm.fit.df <- cluster.3.fit.df %>% dplyr::select(dist, value = SBS.clust.3.FC) %>% mutate(cluster = "cluster.3")
# Cluster 4 - chronic myeloid disorders
cluster.4.fit <- fit_to_signatures(cluster.4.mut.mat, snvs.ori.mut.cluster)
cluster.4.fit.df <- as.data.frame(cluster.4.fit$contribution)
cluster.4.fit.df <- as.data.frame(t(cluster.4.fit.df)) %>% add_rownames(var = "dist") %>% dplyr::select(dist, SBS.clust.4 = cluster.4) %>% mutate(SBS.clust.4.FC = (SBS.clust.4+1)/mean(SBS.clust.4[c(1:10,190:200)]))
cluster.4.fit.df$dist <- as.numeric(cluster.4.fit.df$dist)
cluster.4.norm.fit.df <- cluster.4.fit.df %>% dplyr::select(dist, value = SBS.clust.4.FC) %>% mutate(cluster = "cluster.4")
# Cluster 5 - diverse genomes
cluster.5.fit <- fit_to_signatures(cluster.5.mut.mat, snvs.ori.mut.cluster)
cluster.5.fit.df <- as.data.frame(cluster.5.fit$contribution)
cluster.5.fit.df <- as.data.frame(t(cluster.5.fit.df)) %>% add_rownames(var = "dist") %>% dplyr::select(dist, SBS.clust.5 = cluster.5) %>% mutate(SBS.clust.5.FC = SBS.clust.5/mean(SBS.clust.5[c(1:10,190:200)]))
cluster.5.fit.df$dist <- as.numeric(cluster.5.fit.df$dist)
cluster.5.norm.fit.df <- cluster.5.fit.df %>% dplyr::select(dist, value = SBS.clust.5.FC) %>% mutate(cluster = "cluster.5")
# Combine results and plot
cluster.summary.norm.fit.df <- rbind(cluster.1.norm.fit.df, cluster.2.norm.fit.df, cluster.3.norm.fit.df, cluster.4.norm.fit.df, cluster.5.norm.fit.df)
saveRDS(cluster.summary.norm.fit.df, file = "./002_Mut_signatures_Pan_cancer/sig/cluster.summary.norm.fit.df.rds")
library(dplyr)
library(ggplot2)
setwd("/Users/pm23/Desktop/Projects/OriCan")
# Load results
cluster.summary.norm.fit.df <- readRDS(file = "./002_Mut_signatures_Pan_cancer/sig/cluster.summary.norm.fit.df.rds")
# Plot
cluster.summary.norm.fit.plot.1 <- cluster.summary.norm.fit.df %>% filter(cluster == "cluster.1" | cluster == "cluster.3" | cluster == "cluster.5") %>%
ggplot(aes(x = dist, y = value, color = cluster)) +
geom_line() + xlim(-5000,5000) + ylim(0.5, 1.5) +
scale_color_manual(values = c("#0173B4", "#069F73", "#56B3E6")) +
geom_hline(yintercept=1, linetype="dashed") +
xlab("Distance from origins (bp)") + ylab("Signature contribution\n(fold change from background)") +
theme_bw() + theme(aspect.ratio=1, panel.grid.minor = element_line(linetype = "dotted"))
cluster.summary.norm.fit.plot.1

cluster.summary.norm.fit.plot.2 <- cluster.summary.norm.fit.df %>% filter(cluster == "cluster.2" | cluster == "cluster.4") %>%
ggplot(aes(x = dist, y = value, color = cluster)) +
geom_line() + xlim(-5000,5000) + ylim(0, 10) +
scale_color_manual(values = c("#D56014", "#E6A004")) +
geom_hline(yintercept=1, linetype="dashed") +
xlab("Distance from origins (bp)") + ylab("Signature contribution\n(fold change from background)") +
theme_bw() + theme(aspect.ratio=1, panel.grid.minor = element_line(linetype = "dotted"))
cluster.summary.norm.fit.plot.2

pdf("./Rplots/cluster.summary.norm.fit.plot.pdf", width=10, height=6, useDingbats=FALSE)
ggarrange(cluster.summary.norm.fit.plot.2, cluster.summary.norm.fit.plot.1, nrow = 1, ncol = 2)
dev.off()
## quartz_off_screen
## 2
Genome-wide mutation
signatures of identified clusters
Compute aggregated mutational signatures for each cluster
As a control, the genome-side signatures from genomes of each cluster
is extracted.
Prepare snsvs gr lists per cluster.
# r
library(dplyr)
library(stringr)
library(BSgenome.Hsapiens.UCSC.hg19)
# Open manifest
SSM.manifest.df <- read.table(gzfile("./Dataset/ICGC/SSM/manifest.1698683356752.tar.gz"), skip = 1)
SSM.manifest.df <- SSM.manifest.df %>% dplyr::select(V9, V10, V2, V3, V5)
colnames(SSM.manifest.df) <- c("donor.id", "project", "file.id", "object.id", "file.name")
snvs.manifest.df <- SSM.manifest.df %>% filter(str_detect(file.name, 'somatic.snv')) %>% separate(project, c("cancer.type", "project"), sep = "-")
# 1,950 entries, 1,830 donors
# Recover genomes to analyse
cluster.1.vcf.df <- snvs.manifest.df %>% filter(donor.id %in% cluster.1.id) # 42 genomes
cluster.2.vcf.df <- snvs.manifest.df %>% filter(donor.id %in% cluster.2.id) # 46 genomes
cluster.3.vcf.df <- snvs.manifest.df %>% filter(donor.id %in% cluster.3.id) # 29 genomes
cluster.4.vcf.df <- snvs.manifest.df %>% filter(donor.id %in% cluster.4.id) # 23 genomes
cluster.5.vcf.df <- snvs.manifest.df %>% filter(donor.id %in% cluster.5.id) # 93 genomes
# List vcf files
vcf.files <- list.files("./Dataset/ICGC/SSM/VCF/") # 3,892 files
vcf.files <- vcf.files[!grepl(".idx", vcf.files)] # 1,946 snvs vcf files
# per cluster
vcf.files.cluster.1 <- vcf.files[vcf.files %in% cluster.1.vcf.df$file.name] # 42 files
vcf.files.cluster.2 <- vcf.files[vcf.files %in% cluster.2.vcf.df$file.name] # 46 files
vcf.files.cluster.3 <- vcf.files[vcf.files %in% cluster.3.vcf.df$file.name] # 29 files
vcf.files.cluster.4 <- vcf.files[vcf.files %in% cluster.4.vcf.df$file.name] # 23 files
vcf.files.cluster.5 <- vcf.files[vcf.files %in% cluster.5.vcf.df$file.name] # 93 files
# define path
vcf.path.cluster.1 <- paste("./Dataset/ICGC/SSM/VCF/", vcf.files.cluster.1, sep = "")
vcf.path.cluster.2 <- paste("./Dataset/ICGC/SSM/VCF/", vcf.files.cluster.2, sep = "")
vcf.path.cluster.3 <- paste("./Dataset/ICGC/SSM/VCF/", vcf.files.cluster.3, sep = "")
vcf.path.cluster.4 <- paste("./Dataset/ICGC/SSM/VCF/", vcf.files.cluster.4, sep = "")
vcf.path.cluster.5 <- paste("./Dataset/ICGC/SSM/VCF/", vcf.files.cluster.5, sep = "")
# define path
vcf.name.cluster.1 <- cluster.1.vcf.df$donor.id
vcf.name.cluster.2 <- cluster.2.vcf.df$donor.id
vcf.name.cluster.3 <- cluster.3.vcf.df$donor.id
vcf.name.cluster.4 <- cluster.4.vcf.df$donor.id
vcf.name.cluster.5 <- cluster.5.vcf.df$donor.id
# Load vcf files in a gr lists
ref.genome <- BSgenome.Hsapiens.UCSC.hg19
cluster.1.grl <- read_vcfs_as_granges(vcf.path.cluster.1, vcf.name.cluster.1, ref.genome) # xx vcf
cluster.2.grl <- read_vcfs_as_granges(vcf.path.cluster.2, vcf.name.cluster.2, ref.genome) # xx vcf
cluster.3.grl <- read_vcfs_as_granges(vcf.path.cluster.3, vcf.name.cluster.3, ref.genome) # xx vcf
cluster.4.grl <- read_vcfs_as_granges(vcf.path.cluster.4, vcf.name.cluster.4, ref.genome) # xx vcf
cluster.5.grl <- read_vcfs_as_granges(vcf.path.cluster.5, vcf.name.cluster.5, ref.genome) # xx vcf
# Save grange list
saveRDS(cluster.1.grl, file = "./002_Mut_signatures_Pan_cancer/rds/cluster.1.grl.rds")
saveRDS(cluster.2.grl, file = "./002_Mut_signatures_Pan_cancer/rds/cluster.2.grl.rds")
saveRDS(cluster.3.grl, file = "./002_Mut_signatures_Pan_cancer/rds/cluster.3.grl.rds")
saveRDS(cluster.4.grl, file = "./002_Mut_signatures_Pan_cancer/rds/cluster.4.grl.rds")
saveRDS(cluster.5.grl, file = "./002_Mut_signatures_Pan_cancer/rds/cluster.5.grl.rds")
# Compute mutation matrices
cluster.1.mut.mat <- mut_matrix(vcf_list = unlist(cluster.1.grl), ref_genome = ref.genome)
cluster.2.mut.mat <- mut_matrix(vcf_list = unlist(cluster.2.grl), ref_genome = ref.genome)
cluster.3.mut.mat <- mut_matrix(vcf_list = unlist(cluster.3.grl), ref_genome = ref.genome)
cluster.4.mut.mat <- mut_matrix(vcf_list = unlist(cluster.4.grl), ref_genome = ref.genome)
cluster.5.mut.mat <- mut_matrix(vcf_list = unlist(cluster.5.grl), ref_genome = ref.genome)
# Modify and combine
colnames(cluster.1.mut.mat) <- c("cluster.1")
colnames(cluster.2.mut.mat) <- c("cluster.2")
colnames(cluster.3.mut.mat) <- c("cluster.3")
colnames(cluster.4.mut.mat) <- c("cluster.4")
colnames(cluster.5.mut.mat) <- c("cluster.5")
cluster.summary.mut.mat <- cbind.data.frame(cluster.1.mut.mat, cluster.2.mut.mat, cluster.3.mut.mat, cluster.4.mut.mat, cluster.5.mut.mat)
# Save mutation matrix
saveRDS(cluster.summary.mut.mat, file = "./002_Mut_signatures_Pan_cancer/rds/cluster.summary.mut.mat.rds")
# Assess per genome, per cluster genome-wide mutational burden
# Cluster 1
gw.snvs.count.summary.cluster.1.df <- tibble()
for (i in 1:length(cluster.1.grl)) {
sample.i <- names(cluster.1.grl@partitioning)[i]
count.i <- length(unlist(start(cluster.1.grl[i])))
gw.summary.i <- cbind.data.frame(donor.id = sample.i, gw.snvs.count = count.i, cluster = 1)
gw.snvs.count.summary.cluster.1.df <- rbind(gw.snvs.count.summary.cluster.1.df, gw.summary.i)
}
# Cluster 2
gw.snvs.count.summary.cluster.2.df <- tibble()
for (i in 1:length(cluster.2.grl)) {
sample.i <- names(cluster.2.grl@partitioning)[i]
count.i <- length(unlist(start(cluster.2.grl[i])))
gw.summary.i <- cbind.data.frame(donor.id = sample.i, gw.snvs.count = count.i, cluster = 2)
gw.snvs.count.summary.cluster.2.df <- rbind(gw.snvs.count.summary.cluster.2.df, gw.summary.i)
}
# Cluster 3
gw.snvs.count.summary.cluster.3.df <- tibble()
for (i in 1:length(cluster.3.grl)) {
sample.i <- names(cluster.3.grl@partitioning)[i]
count.i <- length(unlist(start(cluster.3.grl[i])))
gw.summary.i <- cbind.data.frame(donor.id = sample.i, gw.snvs.count = count.i, cluster = 3)
gw.snvs.count.summary.cluster.3.df <- rbind(gw.snvs.count.summary.cluster.3.df, gw.summary.i)
}
# Cluster 4
gw.snvs.count.summary.cluster.4.df <- tibble()
for (i in 1:length(cluster.4.grl)) {
sample.i <- names(cluster.4.grl@partitioning)[i]
count.i <- length(unlist(start(cluster.4.grl[i])))
gw.summary.i <- cbind.data.frame(donor.id = sample.i, gw.snvs.count = count.i, cluster = 4)
gw.snvs.count.summary.cluster.4.df <- rbind(gw.snvs.count.summary.cluster.4.df, gw.summary.i)
}
# Cluster 5
gw.snvs.count.summary.cluster.5.df <- tibble()
for (i in 1:length(cluster.5.grl)) {
sample.i <- names(cluster.5.grl@partitioning)[i]
count.i <- length(unlist(start(cluster.5.grl[i])))
gw.summary.i <- cbind.data.frame(donor.id = sample.i, gw.snvs.count = count.i, cluster = 5)
gw.snvs.count.summary.cluster.5.df <- rbind(gw.snvs.count.summary.cluster.5.df, gw.summary.i)
}
gw.snvs.count.summary.cluster.df <- rbind(gw.snvs.count.summary.cluster.1.df, gw.snvs.count.summary.cluster.2.df,
gw.snvs.count.summary.cluster.3.df, gw.snvs.count.summary.cluster.4.df,
gw.snvs.count.summary.cluster.5.df)
# Save snvs count summary
saveRDS(gw.snvs.count.summary.cluster.df, file = "./002_Mut_signatures_Pan_cancer/rds/gw.snvs.count.summary.cluster.df.rds")
Plot genome-wide signatures and mutational burden.
# r
library(dplyr)
library(MutationalPatterns)
library(ggplot2)
library(scales)
setwd("/Users/pm23/Desktop/Projects/OriCan")
# Load mutation matrix
cluster.summary.mut.mat <- readRDS(file = "./002_Mut_signatures_Pan_cancer/rds/cluster.summary.mut.mat.rds")
# Plot 96 profiles
plot_96_profile(cluster.summary.mut.mat, condensed = TRUE, ymax = 0.3)

# Save profiles
pdf("./Rplots/snvs.gw.mut.cluster.pdf", width=5, height=6, useDingbats=FALSE)
plot_96_profile(cluster.summary.mut.mat, condensed = TRUE, ymax = 0.3)
dev.off()
## quartz_off_screen
## 2
# Compute cosine similarities between origin and genome-wide signatures
# Load origin associated signatures
snvs.ori.mut.cluster <- readRDS("./002_Mut_signatures_Pan_cancer/rds/snvs.ori.mut.cluster.signatures.rds")
# Compute cosine similarites
cos_sim(snvs.ori.mut.cluster[, 1], cluster.summary.mut.mat[, 1]) # 0.8561225
## [1] 0.8561225
cos_sim(snvs.ori.mut.cluster[, 2], cluster.summary.mut.mat[, 2]) # 0.2775516
## [1] 0.2775516
cos_sim(snvs.ori.mut.cluster[, 3], cluster.summary.mut.mat[, 3]) # 0.5507919
## [1] 0.5507919
cos_sim(snvs.ori.mut.cluster[, 4], cluster.summary.mut.mat[, 4]) # 0.4684654
## [1] 0.4684654
cos_sim(snvs.ori.mut.cluster[, 5], cluster.summary.mut.mat[, 5]) # 0.7121845
## [1] 0.7121845
Comparison of
origin-associated signatures with known signatures in cancers
Cosine similarity
with COSMIC signatures
# Recover known signatures (COSMIC signatures)
COSMIC.signatures <- as.matrix(read.table("./Dataset/COSMIC/COSMIC_v3.2_SBS_GRCh37.txt", header = T, row.names = 1))
# Reorder rows
new_order <- sapply(rownames(snvs.ori.mut.cluster), function(x,df){which(rownames(COSMIC.signatures) == x)}, df=COSMIC.signatures)
COSMIC.signatures <- COSMIC.signatures[new_order,]
# Compute cosine similarity matrix
snvs.ori.COSMIC.sim.mat <- cos_sim_matrix(COSMIC.signatures, snvs.ori.mut.cluster)
snvs.ori.COSMIC.sim.mat.melt <- melt(snvs.ori.COSMIC.sim.mat)
# Save results
saveRDS(snvs.ori.COSMIC.sim.mat.melt, "./002_Mut_signatures_Pan_cancer/rds/snvs.ori.COSMIC.sim.mat.melt.rds")
Plot
# r
library(dplyr)
library(ggplot2)
setwd("/Users/pm23/Desktop/Projects/OriCan")
# Load cosine similarity matrix
snvs.ori.COSMIC.sim.mat.melt <- readRDS(file = "./002_Mut_signatures_Pan_cancer/rds/snvs.ori.COSMIC.sim.mat.melt.rds")
# Plot
snvs.ori.COSMIC.sim.heatmap <- snvs.ori.COSMIC.sim.mat.melt %>% ggplot(aes(x=Var2, y=Var1, fill=value)) + geom_tile(color = "white") +
scale_fill_gradient2(low = "#4F4A75", high = "#E41F1A", mid = "white", midpoint = 0.4, limit = c(0,1), space = "Lab", name="Cosine\nSimilarity") +
theme_minimal() + theme(aspect.ratio=10, axis.text.x = element_text(angle = 45, vjust = 1, size = 12, hjust = 1), axis.text.y = element_text(size = 12), axis.title.x=element_blank(), axis.title.y=element_blank())
snvs.ori.COSMIC.sim.heatmap

pdf("./Rplots/snvs.ori.COSMIC.sim.heatmap.pdf", width=3, height=10, useDingbats=FALSE)
snvs.ori.COSMIC.sim.heatmap
dev.off()
## quartz_off_screen
## 2
The analysis shows that: - cluster 1 has high similarity with SBS7b
(cos = 0.835) and SBS7a (cos = 0.793) associated with UV light exposure.
This suggests that mutagenesis at origins in skin melanoma is due to
defective NER. - SBS43 is the signature with highest similarity to
cluster 2 (cos = 0.945). This is a signature of unknown etiology. This
may reflect an unknown mutational process operating at origins. -
cluster 3 has high similarity with SBS10b (cos = 0.642) associated with
mutations within polymerase epsilon exonuclease domain. - cluster 4 has
high similarity with SBS84 (cos = 0.877) reflecting the activity of
activation-induced cytidine deaminase (AID). This suggests that AID
hyperactivity in lymphomas focuses mutagenesis at origins. - cluster 5
has high similarity with SBS5 (cos = 0.846) which is a flat and
clock-like signature of unknown etiology. This cluster may then reflect
background mutagenesis.
COSMIC signatures
refitting
Assess the local exposure of relevant COSMIC signatures. COSMIC
signatures with incompatible etiology are excluded (e.g. chemo
treatment).
Only signatures contributing for at least 50 snvs in a bin.
# r
# Load COSMIC signatures
COSMIC.signatures <- as.matrix(read.table("./Dataset/COSMIC/COSMIC_v3.2_SBS_GRCh37.txt", header = T, row.names = 1))
# Reorder rows
COSMIC.signatures <- COSMIC.signatures[rownames(snvs.ori.mut.cluster),,drop=FALSE]
# Select COSMIC signatures to exclude
COSMIC.ex <- c("SBS4", "SBS11", "SBS22", "SBS24", "SBS25", "SBS29", "SBS31", "SBS32", "SBS35", "SBS42", "SBS86", "SBS87", "SBS88", "SBS90")
COSMIC.filter <- colnames(COSMIC.signatures)[!colnames(COSMIC.signatures) %in% COSMIC.ex]
COSMIC.sigs <- as.data.frame(COSMIC.signatures) %>% dplyr::select(all_of(COSMIC.filter)) %>% as.matrix()
# Load mutation count matrices
cluster.1.mut.mat <- readRDS(file = "./002_Mut_signatures_Pan_cancer/sig/cluster.1.mut.mat.rds")
cluster.2.mut.mat <- readRDS(file = "./002_Mut_signatures_Pan_cancer/sig/cluster.2.mut.mat.rds")
cluster.3.mut.mat <- readRDS(file = "./002_Mut_signatures_Pan_cancer/sig/cluster.3.mut.mat.rds")
cluster.4.mut.mat <- readRDS(file = "./002_Mut_signatures_Pan_cancer/sig/cluster.4.mut.mat.rds")
cluster.5.mut.mat <- readRDS(file = "./002_Mut_signatures_Pan_cancer/sig/cluster.5.mut.mat.rds")
# Cluster 1 - skin melanomas
cluster.1.COSMIC.fit <- fit_to_signatures_strict(cluster.1.mut.mat, COSMIC.sigs, max_delta = 0.004)
cluster.1.COSMIC.fit.df <- as.data.frame(cluster.1.COSMIC.fit$fit_res$contribution)
cluster.1.COSMIC.fit.df <- as.data.frame(t(cluster.1.COSMIC.fit.df))
cluster.1.COSMIC.fit.filter.df <- cluster.1.COSMIC.fit.df %>% dplyr::select_if(~any(. >= 50)) %>% add_rownames(var = "dist") %>% gather("sig", "value", -dist)
saveRDS(cluster.1.COSMIC.fit.filter.df, "./002_Mut_signatures_Pan_cancer/rds/cluster.1.COSMIC.fit.filter.df.rds")
# Cluster 2 - diverse genomes
cluster.2.COSMIC.fit <- fit_to_signatures_strict(cluster.2.mut.mat, COSMIC.sigs, max_delta = 0.004)
cluster.2.COSMIC.fit.df <- as.data.frame(cluster.2.COSMIC.fit$fit_res$contribution)
cluster.2.COSMIC.fit.df <- as.data.frame(t(cluster.2.COSMIC.fit.df))
cluster.2.COSMIC.fit.filter.df <- cluster.2.COSMIC.fit.df %>% dplyr::select_if(~any(. >= 50)) %>% add_rownames(var = "dist") %>% gather("sig", "value", -dist)
saveRDS(cluster.2.COSMIC.fit.filter.df, "./002_Mut_signatures_Pan_cancer/rds/cluster.2.COSMIC.fit.filter.df.rds")
# Cluster 3 - diverse genomes
cluster.3.COSMIC.fit <- fit_to_signatures_strict(cluster.3.mut.mat, COSMIC.sigs, max_delta = 0.004)
cluster.3.COSMIC.fit.df <- as.data.frame(cluster.3.COSMIC.fit$fit_res$contribution)
cluster.3.COSMIC.fit.df <- as.data.frame(t(cluster.3.COSMIC.fit.df))
cluster.3.COSMIC.fit.filter.df <- cluster.3.COSMIC.fit.df %>% dplyr::select_if(~any(. >= 50)) %>% add_rownames(var = "dist") %>% gather("sig", "value", -dist)
saveRDS(cluster.3.COSMIC.fit.filter.df, "./002_Mut_signatures_Pan_cancer/rds/cluster.3.COSMIC.fit.filter.df.rds")
# Cluster 4 - malignant lymphomas
cluster.4.COSMIC.fit <- fit_to_signatures_strict(cluster.4.mut.mat, COSMIC.sigs, max_delta = 0.004)
cluster.4.COSMIC.fit.df <- as.data.frame(cluster.4.COSMIC.fit$fit_res$contribution)
cluster.4.COSMIC.fit.df <- as.data.frame(t(cluster.4.COSMIC.fit.df))
cluster.4.COSMIC.fit.filter.df <- cluster.4.COSMIC.fit.df %>% dplyr::select_if(~any(. >= 50)) %>% add_rownames(var = "dist") %>% gather("sig", "value", -dist)
saveRDS(cluster.4.COSMIC.fit.filter.df, "./002_Mut_signatures_Pan_cancer/rds/cluster.4.COSMIC.fit.filter.df.rds")
# Cluster 5 - diverse genomes
cluster.5.COSMIC.fit <- fit_to_signatures_strict(cluster.5.mut.mat, COSMIC.sigs, max_delta = 0.004)
cluster.5.COSMIC.fit.df <- as.data.frame(cluster.5.COSMIC.fit$fit_res$contribution)
cluster.5.COSMIC.fit.df <- as.data.frame(t(cluster.5.COSMIC.fit.df))
cluster.5.COSMIC.fit.filter.df <- cluster.5.COSMIC.fit.df %>% dplyr::select_if(~any(. >= 50)) %>% add_rownames(var = "dist") %>% gather("sig", "value", -dist)
saveRDS(cluster.5.COSMIC.fit.filter.df, "./002_Mut_signatures_Pan_cancer/rds/cluster.5.COSMIC.fit.filter.df.rds")
Plot exposures
# r
library(dplyr)
library(ggplot2)
library(ggpubr)
setwd("/Users/pm23/Desktop/Projects/OriCan")
# Cluster 1 - skin melanomas
cluster.1.COSMIC.fit.filter.df <- readRDS(file = "./002_Mut_signatures_Pan_cancer/rds/cluster.1.COSMIC.fit.filter.df.rds")
cluster.1.COSMIC.fit.filter.plot <- cluster.1.COSMIC.fit.filter.df %>%
ggplot(aes(x=as.numeric(dist), y=value, colour=sig)) + geom_line() +
scale_color_manual(values = c("#E4221E", "#0775B6")) +
xlim(-5000,5000) +
xlab("Distance from origin (bp)") + ylab("COSMIC signature exposure\n(absolute contribution)") + ggtitle("Cluster 1") +
theme_bw() + theme(aspect.ratio=1.0, panel.grid.minor = element_line(linetype = "dotted"))
cluster.1.COSMIC.fit.filter.plot

# Cluster 2 - diverse genomes
cluster.2.COSMIC.fit.filter.df <- readRDS(file = "./002_Mut_signatures_Pan_cancer/rds/cluster.2.COSMIC.fit.filter.df.rds")
cluster.2.COSMIC.fit.filter.plot <- cluster.2.COSMIC.fit.filter.df %>%
ggplot(aes(x=as.numeric(dist), y=value, colour=sig)) + geom_line() +
scale_color_manual(values = c("#4F4A75", "#E4221E", "#0775B6", "#D56114")) +
xlim(-5000,5000) +
xlab("Distance from origin (bp)") + ylab("COSMIC signature exposure\n(absolute contribution)") + ggtitle("Cluster 2") +
theme_bw() + theme(aspect.ratio=1.0, panel.grid.minor = element_line(linetype = "dotted"))
cluster.2.COSMIC.fit.filter.plot

# Cluster 3 - diverse genomes
cluster.3.COSMIC.fit.filter.df <- readRDS(file = "./002_Mut_signatures_Pan_cancer/rds/cluster.3.COSMIC.fit.filter.df.rds")
c3.pal <- c("#999999", "#E69F00", "#56B4E9", "#009E73", "#CC79A7", "#0072B2", "#D55E00")
cluster.3.COSMIC.fit.filter.plot <- cluster.3.COSMIC.fit.filter.df %>%
ggplot(aes(x=as.numeric(dist), y=value, colour=sig)) + geom_line() +
scale_color_manual(values = c3.pal) +
xlim(-5000,5000) +
xlab("Distance from origin (bp)") + ylab("COSMIC signature exposure\n(absolute contribution)") + ggtitle("Cluster 3") +
theme_bw() + theme(aspect.ratio=1.0, panel.grid.minor = element_line(linetype = "dotted"))
cluster.3.COSMIC.fit.filter.plot

# Cluster 4 - malignant lymphomas
cluster.4.COSMIC.fit.filter.df <- readRDS(file = "./002_Mut_signatures_Pan_cancer/rds/cluster.4.COSMIC.fit.filter.df.rds")
c4.pal <- c("#999999", "#E69F00", "#56B4E9", "#009E73", "#E4221E", "#0072B2", "#D55E00")
cluster.4.COSMIC.fit.filter.plot <- cluster.4.COSMIC.fit.filter.df %>%
ggplot(aes(x=as.numeric(dist), y=value, colour=sig)) + geom_line() +
scale_color_manual(values = c4.pal) +
xlim(-5000,5000) +
xlab("Distance from origin (bp)") + ylab("COSMIC signature exposure\n(absolute contribution)") + ggtitle("Cluster 4") +
theme_bw() + theme(aspect.ratio=1.0, panel.grid.minor = element_line(linetype = "dotted"))
cluster.4.COSMIC.fit.filter.plot

# Cluster 5 - diverse genomes
cluster.5.COSMIC.fit.filter.df <- readRDS(file = "./002_Mut_signatures_Pan_cancer/rds/cluster.5.COSMIC.fit.filter.df.rds")
c5.pal <- c("#999999", "#E69F00", "#56B4E9", "#009E73", "#E4221E", "#0072B2", "#D55E00")
# To many signature to plot, select only signatures with signal at origins
cluster.5.sel.COSMIC.sig <- cluster.5.COSMIC.fit.filter.df %>% filter(dist == 0 & value > 0)
cluster.5.COSMIC.fit.filter.plot <- cluster.5.COSMIC.fit.filter.df %>%
filter(sig %in% cluster.5.sel.COSMIC.sig$sig) %>%
ggplot(aes(x=as.numeric(dist), y=value, colour=sig)) + geom_line() +
scale_color_manual(values = c5.pal) +
xlim(-5000,5000) +
xlab("Distance from origin (bp)") + ylab("COSMIC signature exposure\n(absolute contribution)") + ggtitle("Cluster 5") +
theme_bw() + theme(aspect.ratio=1.0, panel.grid.minor = element_line(linetype = "dotted"))
cluster.5.COSMIC.fit.filter.plot

pdf("./Rplots/COSMIC.refit.pdf", width=14, height=8, useDingbats=FALSE)
ggarrange(cluster.1.COSMIC.fit.filter.plot, cluster.2.COSMIC.fit.filter.plot, cluster.3.COSMIC.fit.filter.plot, cluster.4.COSMIC.fit.filter.plot, cluster.5.COSMIC.fit.filter.plot, ncol = 3, nrow = 2)
dev.off()
## quartz_off_screen
## 2
NER efficiency at
origins in skin melanomas
Cluster 1 of mutational signatures suggest differential NER activity
at replication origins compared to background.
In this section, we aim to characterise NER efficiency at origins and
correlation with local variation in mutation burden.
NER efficiency is assessed using XR-seq data from Hu et al. GENES
& DEVELOPMENT 2015 29:948–960 available at https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE67941
data is hg19
NER activity at
origins
We first aim to demonstrate that origin sites are prone to damage
induced by UV exposure as there are site of increased accessibility.
To assess accessibility we use DHS data from ENCODE for GM04504 cells
(Normal Human Dermal Fibroblasts (NHDF)) recovered from https://www.encodeproject.org/experiments/ENCSR000EMP/
(replicate 1 - ENCAN140WTK).
NER activity is assessed from XR-seq signal at consitutive origins
mapping two UV-induced DNA damages: cyclo-butane pyrimidine dimers
(CPDs) and (6-4) pyrimidine–pyrimidone photoproducts [(6-4)PPs]. We use
data from a CS-B (ERCC6) mutant NHF1 cell line to assess global NER
rather than transcription-coupled NER. As XR-seq is a stranded technique
we first considered aggregated data form plus and minus strands.
# r
# Load XR-seq data for CSB cells
# CPD
XRseq.CPD.plus.bw <- import.bw("./Dataset/Hu_2015/BW/GSE67941_CSBCPD_Merged_PLUS_UNIQUE_NORM_fixedStep_25.bw")
XRseq.CPD.minus.bw <- import.bw("./Dataset/Hu_2015/BW/GSE67941_CSBCPD_Merged_MINUS_UNIQUE_NORM_fixedStep_25.bw")
seqlevelsStyle(XRseq.CPD.plus.bw) <- "NCBI"
seqlevelsStyle(XRseq.CPD.minus.bw) <- "NCBI"
# 64PP
XRseq.64PP.plus.bw <- import.bw("./Dataset/Hu_2015/BW/GSE67941_CSB64_Merged_PLUS_UNIQUE_NORM_fixedStep_25.bw")
XRseq.64PP.minus.bw <- import.bw("./Dataset/Hu_2015/BW/GSE67941_CSB64_Merged_MINUS_UNIQUE_NORM_fixedStep_25.bw")
seqlevelsStyle(XRseq.64PP.plus.bw) <- "NCBI"
seqlevelsStyle(XRseq.64PP.minus.bw) <- "NCBI"
# Combine both plus and minus sequencing information
# CSB cells - CPD
XRseq.CPD.bw <- XRseq.CPD.plus.bw
XRseq.CPD.bw$plus <- XRseq.CPD.bw$score
XRseq.CPD.bw$minus <- XRseq.CPD.minus.bw$score
XRseq.CPD.bw$score <- XRseq.CPD.bw$plus + XRseq.CPD.bw$minus
export.bw(XRseq.CPD.bw, "./Dataset/Hu_2015/BW/XRseq.CSB.CPD.unstranded.bw")
# CSB cells - 64PP
XRseq.64PP.bw <- XRseq.64PP.plus.bw
XRseq.64PP.bw$plus <- XRseq.64PP.bw$score
XRseq.64PP.bw$minus <- XRseq.64PP.minus.bw$score
XRseq.64PP.bw$score <- XRseq.64PP.bw$plus + XRseq.64PP.bw$minus
export.bw(XRseq.64PP.bw, "./Dataset/Hu_2015/BW/XRseq.CSB.64PP.unstranded.bw")
# Association between NER activity and accessibility at origins
XRseq.CPD.bw <- import.bw("./Dataset/Hu_2015/BW/XRseq.CSB.CPD.unstranded.bw")
XRseq.64PP.bw <- import.bw("./Dataset/Hu_2015/BW/XRseq.CSB.64PP.unstranded.bw")
# Load DHS data
DHS.GM04504.bw <- import.bw("./Dataset/DHS/ENCFF228HOM_DHS_GM04504.bigWig")
seqlevelsStyle(DHS.GM04504.bw) <- "NCBI"
# Compute matrices
# For XRseq (CSB NHF1 cells - CPD)
XRseq.CSB.CPD.ori.mat <- normalizeToMatrix(XRseq.CPD.bw, ori.gr, value_column = "score", mean_mode = "coverage", extend = 5000, w = 100, background = 0)
saveRDS(XRseq.CSB.CPD.ori.mat, "./002_Mut_signatures_Pan_cancer/rds/XRseq.CSB.CPD.ori.mat.rds")
# For XRseq (CSB NHF1 cells - 64PP)
XRseq.CSB.64PP.ori.mat <- normalizeToMatrix(XRseq.64PP.bw, ori.gr, value_column = "score", mean_mode = "coverage", extend = 5000, w = 100, background = 0)
saveRDS(XRseq.CSB.64PP.ori.mat, "./002_Mut_signatures_Pan_cancer/rds/XRseq.CSB.64PP.ori.mat.rds")
# For DHS
DHS.ori.mat <- normalizeToMatrix(DHS.GM04504.bw, ori.gr, value_column = "score", mean_mode = "coverage", extend = 5000, w = 100, background = 0)
saveRDS(DHS.ori.mat, "./002_Mut_signatures_Pan_cancer/rds/DHS.ori.mat.rds")
col.fun.CPD <- colorRamp2(quantile(XRseq.CSB.CPD.ori.mat, c(0, 0.95)), c("white", "#0775B6"))
col.fun.64PP <- colorRamp2(quantile(XRseq.CSB.64PP.ori.mat, c(0, 0.95)), c("white", "#0775B6"))
col.fun.DHS <- colorRamp2(quantile(DHS.ori.mat, c(0, 0.95)), c("white", "#E4221E"))
pdf("./Rplots/XRseq.DHS.heatmaps.pdf", width=6, height=4, useDingbats=FALSE)
EnrichedHeatmap(XRseq.CSB.CPD.ori.mat, col = col.fun.CPD, name = "XRseq - CPD", pos_line = F) +
EnrichedHeatmap(XRseq.CSB.64PP.ori.mat, col = col.fun.64PP, name = "XRseq - 64PP", pos_line = F) +
EnrichedHeatmap(DHS.ori.mat, col = col.fun.DHS, name = "DHS", pos_line = F)
dev.off()
# The resulting pdf is subsequently modified and saved as a png
Assess correlation between DHS signal and NER activity
# r
# Assess correlation between DHS signal and NER activity
# Compute mean signals over origins (1 kb centered at origins - 10 bins)
XRseq.CSB.CPD.ori.val <- rowMeans(XRseq.CSB.CPD.ori.mat[,45:54], na.rm = T)
XRseq.CSB.64PP.ori.val <- rowMeans(XRseq.CSB.64PP.ori.mat[,45:54], na.rm = T)
DHS.ori.val <- rowMeans(DHS.ori.mat[,45:54], na.rm = T)
# Combine information, format and plot
XRseq.DHS.CPD.df <- cbind.data.frame(ori.id = ori.gr$ori.id, XRseq.CPD = XRseq.CSB.CPD.ori.val, XRseq.64PP = XRseq.CSB.64PP.ori.val, DHS = DHS.ori.val) %>%
mutate(DHS.bin = ntile(DHS, 20)) %>% group_by(DHS.bin) %>%
summarise(DHS = mean(DHS, na.rm = T), XRseq.mean = mean(XRseq.CPD, na.rm = T), XRseq.sem = std.error(XRseq.CPD, na.rm = T)) %>% mutate(exp = "CPD")
XRseq.DHS.64PP.df <- cbind.data.frame(ori.id = ori.gr$ori.id, XRseq.CPD = XRseq.CSB.CPD.ori.val, XRseq.64PP = XRseq.CSB.64PP.ori.val, DHS = DHS.ori.val) %>%
mutate(DHS.bin = ntile(DHS, 20)) %>% group_by(DHS.bin) %>%
summarise(DHS = mean(DHS, na.rm = T), XRseq.mean = mean(XRseq.64PP, na.rm = T), XRseq.sem = std.error(XRseq.64PP, na.rm = T)) %>% mutate(exp = "64PP")
XRseq.DHS.summary.df <- rbind(XRseq.DHS.CPD.df, XRseq.DHS.64PP.df)
# Assess corelations
cor.test(DHS.ori.val, XRseq.CSB.CPD.ori.val) # Rho = 0.4012731, p-value < 2.2e-16
cor.test(DHS.ori.val, XRseq.CSB.64PP.ori.val) # Rho = 0.1686478, p-value < 2.2e-16
# Save
saveRDS(XRseq.DHS.summary.df, "./002_Mut_signatures_Pan_cancer/rds/XRseq.DHS.summary.df.rds")
library(dplyr)
library(ggplot2)
setwd("/Users/pm23/Desktop/Projects/OriCan")
XRseq.DHS.summary.df <- readRDS(file = "./002_Mut_signatures_Pan_cancer/rds/XRseq.DHS.summary.df.rds")
XRseq.DHS.summary.plot <- XRseq.DHS.summary.df %>%
ggplot(aes(x=DHS, y=XRseq.mean, color = exp)) +
geom_line() +
geom_errorbar(aes(ymin=XRseq.mean-XRseq.sem, ymax=XRseq.mean+XRseq.sem), width=.2, position=position_dodge(0.05)) +
geom_point(shape = 21, size = 2, fill = "white") +
scale_color_manual(values = c("#E41F1A", "#0474B5")) +
scale_x_continuous(trans = log10_trans(), breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x))) +
ylab("NER activity (normalised read count)") + xlab("DHS coverage bin (log10)") + ggtitle("Rho = 0.401, 0.168, p-val < 2.2e-16") +
theme_bw() + theme(aspect.ratio=2, panel.grid.minor = element_line(linetype = "dotted"))
XRseq.DHS.summary.plot

pdf("./Rplots/XRseq.DHS.summary.plot.pdf", width=7, height=6, useDingbats=FALSE)
XRseq.DHS.summary.plot
dev.off()
## quartz_off_screen
## 2
NER efficiency at
origins
Report NER efficiency at origins - we now consider strand specific
data.
# r
library(dplyr)
library(tidyr)
library(rtracklayer)
library(EnrichedHeatmap)
library(ggplot2)
library(plotrix)
library(scales)
setwd("/Users/pm23/Desktop/Projects/OriCan")
# Load origin center coordinates
ori.df <- read.table("./Dataset/ori/ori.inter.hg19.NCBI.bed", sep="\t",)
colnames(ori.df) <- c("seqnames", "start", "end", "EFF", "ori.id", "ori.width")
ori.gr <- makeGRangesFromDataFrame(ori.df, keep.extra.columns = T)
# Load XR-seq data
# We use data from a CS-B (ERCC6) mutant NHF1 cell line to assess global NER rather than transcription-coupled NER
# from Hu et al. GENES & DEVELOPMENT 2015 29:948–960
# downloaded at https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE67941
XRseq.CPD.plus.bw <- import.bw("./Dataset/Hu_2015/BW/GSE67941_CSBCPD_Merged_PLUS_UNIQUE_NORM_fixedStep_25.bw")
XRseq.CPD.minus.bw <- import.bw("./Dataset/Hu_2015/BW/GSE67941_CSBCPD_Merged_MINUS_UNIQUE_NORM_fixedStep_25.bw")
seqlevelsStyle(XRseq.CPD.plus.bw) <- "NCBI"
seqlevelsStyle(XRseq.CPD.minus.bw) <- "NCBI"
XRseq.64PP.plus.bw <- import.bw("./Dataset/Hu_2015/BW/GSE67941_CSB64_Merged_PLUS_UNIQUE_NORM_fixedStep_25.bw")
XRseq.64PP.minus.bw <- import.bw("./Dataset/Hu_2015/BW/GSE67941_CSB64_Merged_MINUS_UNIQUE_NORM_fixedStep_25.bw")
seqlevelsStyle(XRseq.64PP.plus.bw) <- "NCBI"
seqlevelsStyle(XRseq.64PP.minus.bw) <- "NCBI"
# Compute NER activity at origins
# For CPD
XRseq.CPD.plus.ori.mat <- normalizeToMatrix(XRseq.CPD.plus.bw, ori.gr, value_column = "score", mean_mode = "coverage", extend = 10000, w = 100, background = 0)
saveRDS(XRseq.CPD.plus.ori.mat, "./002_Mut_signatures_Pan_cancer/rds/XRseq.CPD.plus.ori.mat.rds")
XRseq.CPD.minus.ori.mat <- normalizeToMatrix(XRseq.CPD.minus.bw, ori.gr, value_column = "score", mean_mode = "coverage", extend = 10000, w = 100, background = 0)
saveRDS(XRseq.CPD.minus.ori.mat, "./002_Mut_signatures_Pan_cancer/rds/XRseq.CPD.minus.ori.mat.rds")
# For 6-4PP
XRseq.64PP.plus.ori.mat <- normalizeToMatrix(XRseq.64PP.plus.bw, ori.gr, value_column = "score", mean_mode = "coverage", extend = 10000, w = 100, background = 0)
saveRDS(XRseq.64PP.plus.ori.mat, "./002_Mut_signatures_Pan_cancer/rds/XRseq.64PP.plus.ori.mat.rds")
XRseq.64PP.minus.ori.mat <- normalizeToMatrix(XRseq.64PP.minus.bw, ori.gr, value_column = "score", mean_mode = "coverage", extend = 10000, w = 100, background = 0)
saveRDS(XRseq.64PP.minus.ori.mat, "./002_Mut_signatures_Pan_cancer/rds/XRseq.64PP.minus.ori.mat.rds")
Plot
# r
library(dplyr)
library(ggplot2)
library(ggpubr)
setwd("/Users/pm23/Desktop/Projects/OriCan")
# Load data
XRseq.CPD.plus.ori.mat <- readRDS("./002_Mut_signatures_Pan_cancer/rds/XRseq.CPD.plus.ori.mat.rds")
XRseq.CPD.minus.ori.mat <- readRDS("./002_Mut_signatures_Pan_cancer/rds/XRseq.CPD.minus.ori.mat.rds")
XRseq.64PP.plus.ori.mat <- readRDS("./002_Mut_signatures_Pan_cancer/rds/XRseq.64PP.plus.ori.mat.rds")
XRseq.64PP.minus.ori.mat <- readRDS("./002_Mut_signatures_Pan_cancer/rds/XRseq.64PP.minus.ori.mat.rds")
# Format
# For CPD
XRseq.CPD.plus.ori.df <- as.data.frame(XRseq.CPD.plus.ori.mat) %>% summarise_all(~ mean(.x, na.rm = TRUE)) %>% t() %>% `colnames<-`(.[1, ]) %>% .[-1, ] %>% as_tibble() %>% mutate(dist = seq(-9900,10000,100)) %>% mutate(strand = "plus")
XRseq.CPD.minus.ori.df <- as.data.frame(XRseq.CPD.minus.ori.mat) %>% summarise_all(~ mean(.x, na.rm = TRUE)) %>% t() %>% `colnames<-`(.[1, ]) %>% .[-1, ] %>% as_tibble() %>% mutate(dist = seq(-9900,10000,100)) %>% mutate(strand = "minus")
XRseq.CPD.ori.df <- rbind(XRseq.CPD.plus.ori.df, XRseq.CPD.minus.ori.df)
# Compute strand bias
XRseq.CPD.sb.df <- (XRseq.CPD.plus.ori.df %>% dplyr::select(dist, plus = value)) %>% left_join((XRseq.CPD.minus.ori.df %>% dplyr::select(dist, minus = value)), by = "dist") %>% mutate(strand.bias = plus - minus)
# For (6-4)PP
XRseq.64PP.plus.ori.df <- as.data.frame(XRseq.64PP.plus.ori.mat) %>% summarise_all(~ mean(.x, na.rm = TRUE)) %>% t() %>% `colnames<-`(.[1, ]) %>% .[-1, ] %>% as_tibble() %>% mutate(dist = seq(-9900,10000,100)) %>% mutate(strand = "plus")
XRseq.64PP.minus.ori.df <- as.data.frame(XRseq.64PP.minus.ori.mat) %>% summarise_all(~ mean(.x, na.rm = TRUE)) %>% t() %>% `colnames<-`(.[1, ]) %>% .[-1, ] %>% as_tibble() %>% mutate(dist = seq(-9900,10000,100)) %>% mutate(strand = "minus")
XRseq.64PP.ori.df <- rbind(XRseq.64PP.plus.ori.df, XRseq.64PP.minus.ori.df)
# Plot
XRseq.CPD.ori.plot <- XRseq.CPD.ori.df %>% ggplot(aes(x = dist, y = value, class = strand)) +
geom_line(aes(linetype = strand), color = "#0474B5") + xlim(-5000, 5000) +
scale_linetype_manual(values=c("twodash", "solid")) +
xlab("Distance from origin (bp)") + ylab("NER efficiency (normalised read count)") + ggtitle("CPD") +
theme_bw() + theme(aspect.ratio=1, panel.grid.minor = element_line(linetype = "dotted"))
XRseq.CPD.ori.plot

XRseq.64PP.ori.plot <- XRseq.64PP.ori.df %>% ggplot(aes(x = dist, y = value, class = strand)) +
geom_line(aes(linetype = strand), color = "#E41F1A") + xlim(-5000, 5000) +
scale_linetype_manual(values=c("twodash", "solid")) +
xlab("Distance from origin (bp)") + ylab("NER efficiency (normalised read count)") + ggtitle("(6-4)PP") +
theme_bw() + theme(aspect.ratio=1, panel.grid.minor = element_line(linetype = "dotted"))
XRseq.64PP.ori.plot

pdf("./Rplots/XRseq.ori.plot.pdf", width=10, height=6, useDingbats=FALSE)
ggarrange(XRseq.CPD.ori.plot, XRseq.64PP.ori.plot, nrow = 1, ncol = 2)
dev.off()
## quartz_off_screen
## 2
Plot the distribution of snvs at origins for cluster 1 genome for
comparison
# r
# Recover origin-associated snvs in cluster 1
cluster.1.snvs.10kb.df <- read.table("./002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.1.snvs.10kb.bed", sep = "\t")
colnames(cluster.1.snvs.10kb.df) <- c("seqnames", "start", "end", "REF", "ALT", "seqnames.ori", "start.ori", "end.ori", "EFF", "ori.id", "ori.width", "dist")
# Compute snvs density in mutations per Mb (9,341 origins - 42 genomes)
cluster.1.snvs.dist.df <- cluster.1.snvs.10kb.df %>% mutate(dist = (as.numeric(cut(-dist, breaks = seq(-10000, 10000, 100)))-100)*100) %>% drop_na() %>% group_by(dist) %>% summarise(snvs.count = n()) %>% mutate(snvs.dens = ((snvs.count/(9341*42))*10e6)/100)
saveRDS(cluster.1.snvs.dist.df, "./002_Mut_signatures_Pan_cancer/rds/cluster.1.snvs.dist.df.rds")
library(dplyr)
library(ggplot2)
setwd("/Users/pm23/Desktop/Projects/OriCan")
cluster.1.snvs.dist.df <- readRDS("./002_Mut_signatures_Pan_cancer/rds/cluster.1.snvs.dist.df.rds")
cluster.1.snvs.dist.plot <- cluster.1.snvs.dist.df %>% ggplot(aes(x = dist, y = snvs.dens)) +
geom_line(color = "#E41F1A") + xlim(-5000, 5000) +
xlab("Distance from origin (bp)") + ylab("Mutations per Mb") +
theme_bw() + theme(aspect.ratio=1, panel.grid.minor = element_line(linetype = "dotted"))
cluster.1.snvs.dist.plot

pdf("./Rplots/cluster.1.snvs.dist.plot.pdf", width=7, height=6, useDingbats=FALSE)
cluster.1.snvs.dist.plot
dev.off()
## quartz_off_screen
## 2
Assess the correlation between XR-seq signal and mutations at origins
in cluster 1 genomes
# Recover origin-associated snvs in cluster 1
cluster.1.snvs.10kb.df <- read.table("./002_Mut_signatures_Pan_cancer/sig/cluster/BED/cluster.1.snvs.10kb.bed", sep = "\t")
colnames(cluster.1.snvs.10kb.df) <- c("seqnames", "start", "end", "REF", "ALT", "seqnames.ori", "start.ori", "end.ori", "EFF", "ori.id", "ori.width", "dist")
cluster.1.snvs.ori.summary.df <- cluster.1.snvs.10kb.df %>% filter(dist >= -2000 & dist <= 2000) %>% group_by(ori.id) %>% summarise(snvs.count = n())
# Compute averaged XRseq values
# For CPD
XRseq.CPD.plus.ori.values <- rowMeans(as.data.frame(XRseq.CPD.plus.ori.mat)[,95:105], na.rm = T)
XRseq.CPD.minus.ori.values <- rowMeans(as.data.frame(XRseq.CPD.minus.ori.mat)[,95:105], na.rm = T)
XRseq.CPD.ori.df <- cbind.data.frame(ori.id = ori.gr$ori.id, CPD.plus = XRseq.CPD.plus.ori.values, CPD.minus = XRseq.CPD.minus.ori.values) %>%
mutate(CPD = CPD.plus + CPD.minus) %>% left_join(cluster.1.snvs.ori.summary.df, by = "ori.id") %>%
mutate(bin = ntile(CPD, 20)) %>% group_by(bin) %>% summarise(read = mean(CPD, na.rm = T), snvs.mean = mean(snvs.count, na.rm = T), snvs.sem = std.error(snvs.count, na.rm = T)) %>% mutate(exp = "CPD")
# For (6-4)PP
XRseq.64PP.plus.ori.values <- rowMeans(as.data.frame(XRseq.64PP.plus.ori.mat)[,95:105], na.rm = T)
XRseq.64PP.minus.ori.values <- rowMeans(as.data.frame(XRseq.64PP.minus.ori.mat)[,95:105], na.rm = T)
XRseq.64PP.ori.df <- cbind.data.frame(ori.id = ori.gr$ori.id, PP.plus = XRseq.64PP.plus.ori.values, PP.minus = XRseq.64PP.minus.ori.values) %>%
mutate(PP = PP.plus + PP.minus + 1) %>% left_join(cluster.1.snvs.ori.summary.df, by = "ori.id") %>%
mutate(bin = ntile(PP, 20)) %>% group_by(bin) %>% summarise(read = mean(PP, na.rm = T), snvs.mean = mean(snvs.count, na.rm = T), snvs.sem = std.error(snvs.count, na.rm = T)) %>% mutate(exp = "64PP")
# Save data for plots
XRseq.CPD.64PP.ori.summary.df <- rbind(XRseq.CPD.ori.df, XRseq.64PP.ori.df)
saveRDS(XRseq.CPD.64PP.ori.summary.df, "./002_Mut_signatures_Pan_cancer/rds/XRseq.CPD.64PP.ori.summary.df.rds")
# Compute stats
XRseq.CPD.ori.values.df <- cbind.data.frame(ori.id = ori.gr$ori.id, CPD.plus = XRseq.CPD.plus.ori.values, CPD.minus = XRseq.CPD.minus.ori.values) %>%
mutate(CPD = CPD.plus + CPD.minus) %>% left_join(cluster.1.snvs.ori.summary.df, by = "ori.id") %>% mutate(log10.CPD = log10(CPD)) %>% filter(log10.CPD != "Inf" & log10.CPD != "-Inf")
cor.test(XRseq.CPD.ori.values.df$log10.CPD, XRseq.CPD.ori.values.df$snvs.count) # Rho = -0.4411795, p-value < 2.2e-16
XRseq.64PP.ori.values.df <- cbind.data.frame(ori.id = ori.gr$ori.id, PP.plus = XRseq.64PP.plus.ori.values, PP.minus = XRseq.64PP.minus.ori.values) %>%
mutate(PP = PP.plus + PP.minus) %>% left_join(cluster.1.snvs.ori.summary.df, by = "ori.id") %>% mutate(log10.64PP = log10(PP)) %>% filter(log10.64PP != "Inf" & log10.64PP != "-Inf")
cor.test(XRseq.64PP.ori.values.df$log10.64PP, XRseq.64PP.ori.values.df$snvs.count) # Rho = -0.3659423, p-value < 2.2e-16
library(dplyr)
library(ggplot2)
setwd("/Users/pm23/Desktop/Projects/OriCan")
XRseq.CPD.64PP.ori.summary.df <- readRDS("./002_Mut_signatures_Pan_cancer/rds/XRseq.CPD.64PP.ori.summary.df.rds")
XRseq.CPD.64PP.ori.summary.plot <- XRseq.CPD.64PP.ori.summary.df %>%
ggplot(aes(x=read, y=snvs.mean, color = exp)) +
geom_line() +
geom_errorbar(aes(ymin=snvs.mean-snvs.sem, ymax=snvs.mean+snvs.sem), width=.01, position=position_dodge(0.05)) +
geom_point(shape = 21, size = 2, fill = "white") +
scale_color_manual(values = c("#E41F1A", "#0474B5")) +
scale_x_continuous(trans = log10_trans(), breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x))) +
ylab("Mean mutations per origin") + xlab("CPD XR-seq, binned by read coverage") + ggtitle("Rho-CPD = -0.945, Rho-PP = -0.909, p-value < 2.2e-16") +
theme_bw() + theme(aspect.ratio=2, panel.grid.minor = element_line(linetype = "dotted"))
XRseq.CPD.64PP.ori.summary.plot

pdf("./Rplots/XRseq.ori.snvs.plot.pdf", width=7, height=6, useDingbats=FALSE)
XRseq.CPD.64PP.ori.summary.plot
dev.off()
## quartz_off_screen
## 2
The previous experiments suggest a strand bias for NER activity at
origins. We assess herein whether this due to dinucleotide biases or due
to the single strandeness nature of the lagging strand.
XR-seq data have been generated using the MC-062 and 64M-2 antibodies
for the CPD and 64PP data respectively. The MC-062 has been raised
against thymidine dimers, then CPD XR-seq data are corrected for TT
frequency. The 64M-2 antobodies have been raised against UV-C irradiated
DNA, in this condition 64-PP adduct are formed at CC, CT and TC
dinucleodites. The 64PP XR-seq is then corrected for the frequencies of
these dinucleotides.
Relevant websites: - for CPD: https://www.kamiyabiomedical.com/cgi-bin/search.cgi?search=MC-062&language=english
- for 64PP: https://www.cosmobiousa.com/products/anti-6-4pps-mab-clone-64m-2
Extract dinucleotide frequencies.
# r
# Extract dinucleotide frequency
# Load domain/bin coordinates
ori.10kb.domain.100nt.split.gr <- import("./Dataset/ori/ori.10kb.domain.100nt.split.hg19.NCBI.bed")
my.chr <- c(1:22, "X", "Y")
ori.10kb.domain.100nt.split.gr <- ori.10kb.domain.100nt.split.gr[seqnames(ori.10kb.domain.100nt.split.gr) %in% my.chr]
seqlevelsStyle(ori.10kb.domain.100nt.split.gr) <- "UCSC"
# Recover sequences and compute base composition statistics
ori.views <- Views(Hsapiens, ori.10kb.domain.100nt.split.gr)
# dinuc <- (expand.grid(c("A", "C", "G", "T"), c("A", "C", "G", "T")) %>% mutate(dinuc = paste0(Var1, Var2)))$dinuc
ori.dinuc <- dinucleotideFrequency(ori.views, step=1, as.prob = FALSE)
ori.dinuc.df <- cbind.data.frame(bin = ori.10kb.domain.100nt.split.gr$name, ori.dinuc)
ori.dinuc.summary <- ori.dinuc.df %>% group_by(bin) %>% summarise_at(colnames(ori.dinuc.df)[-1], sum) %>%
mutate(dinuc.sum = rowSums(across(where(is.numeric)))) %>%
mutate(dist = (as.numeric(bin)-100)*100,
TT.freq = TT/dinuc.sum,
AA.freq = AA/dinuc.sum,
CC.freq = CC/dinuc.sum,
GG.freq = GG/dinuc.sum,
CT.freq = CT/dinuc.sum,
AG.freq = AG/dinuc.sum,
TC.freq = TC/dinuc.sum,
GA.freq = GA/dinuc.sum)
write.csv(ori.dinuc.summary, "./Dataset/ori/ori.dinuc.summary.csv", quote = F, row.names = F)
Correct XR-seq signal
#r
# Correction
# for CPD
XRseq.CPD.ori.corr.df <- XRseq.CPD.ori.df %>% left_join(ori.dinuc.summary, by = "dist") %>%
mutate(value.corr = case_when(strand == "plus" ~ value/TT.freq, T ~ value/AA.freq))
# for 64PP
XRseq.64PP.ori.corr.df <- XRseq.64PP.ori.df %>% left_join(ori.dinuc.summary, by = "dist") %>%
mutate(value.corr = case_when(strand == "plus" ~ value/(CC.freq+CT.freq+TC.freq), T ~ value/(GG.freq+AG.freq+GA.freq)))
# for CPD
XRseq.CPD.plus.df <- XRseq.CPD.ori.corr.df %>% filter(strand == "plus") %>% dplyr::select(dist, plus = value.corr)
XRseq.CPD.minus.df <- XRseq.CPD.ori.corr.df %>% filter(strand == "minus") %>% dplyr::select(dist, minus = value.corr)
XRseq.CPD.strand.df <- XRseq.CPD.plus.df %>% left_join(XRseq.CPD.minus.df, by = "dist") %>% mutate(bias = plus / minus, class ="CPD")
# for 64PP
XRseq.64PP.plus.df <- XRseq.64PP.ori.corr.df %>% filter(strand == "plus") %>% dplyr::select(dist, plus = value.corr)
XRseq.64PP.minus.df <- XRseq.64PP.ori.corr.df %>% filter(strand == "minus") %>% dplyr::select(dist, minus = value.corr)
XRseq.64PP.strand.df <- XRseq.64PP.plus.df %>% left_join(XRseq.64PP.minus.df, by = "dist") %>% mutate(bias = plus / minus, class ="64PP")
# Combine
XRseq.strand.summary.df <- rbind(XRseq.CPD.strand.df, XRseq.64PP.strand.df)
saveRDS(XRseq.strand.summary.df, "./002_Mut_signatures_Pan_cancer/rds/XRseq.strand.summary.df.rds")
Plot
library(dplyr)
library(ggplot2)
setwd("/Users/pm23/Desktop/Projects/OriCan")
# Load data
XRseq.strand.summary.df <- readRDS("./002_Mut_signatures_Pan_cancer/rds/XRseq.strand.summary.df.rds")
# Plot
XRseq.strand.summary.plot <- XRseq.strand.summary.df %>% ggplot(aes(x = dist, y = bias, color = class)) +
geom_line() + xlim(-5000, 5000) +
scale_color_manual(values = c("#E41F1A", "#0474B5")) +
xlab("Distance from origin (bp)") + ylab("NER strand bias (plus / minus)") +
geom_hline(yintercept=1, linetype="dashed") +
theme_bw() + theme(aspect.ratio=1, panel.grid.minor = element_line(linetype = "dotted"))
XRseq.strand.summary.plot

pdf("./Rplots/XRseq.strand.bias.plot.pdf", width=7, height=6, useDingbats=FALSE)
XRseq.strand.summary.plot
dev.off()
## quartz_off_screen
## 2
Cluster 1 SBS strand
asymetry
XR-seq profiles at origins suggest that a strand asymetry for
exposure to cluster 1 SBS
We assess this point by fitting the corresponding signature to
pyrimidine and purine mutations separately
# Load cluster 1 snvs gr list
cluster.1.snvs.grl <- readRDS("./002_Mut_signatures_Pan_cancer/sig/cluster.1.snvs.grl.rds")
# Split pyrimidine and purine mutations
cluster.1.pyr.grl <- cluster.1.snvs.grl
cluster.1.pur.grl <- cluster.1.snvs.grl
for (i in 1:length(cluster.1.snvs.grl)) {
print(i/length(cluster.1.snvs.grl))
gr.i <- cluster.1.snvs.grl[i]
bin.i <- names(gr.i)
unlist.gr.i <- unlist(gr.i)
pyr.gr.i <- unlist.gr.i[as.character(unlist.gr.i$REF) == "C" | as.character(unlist.gr.i$REF) == "T"]
pyr.gr.comp.i <- GRangesList(pyr.gr.i, compress=TRUE)
cluster.1.pyr.grl[i] <- pyr.gr.comp.i
pur.gr.i <- unlist.gr.i[as.character(unlist.gr.i$REF) == "G" | as.character(unlist.gr.i$REF) == "A"]
pur.gr.comp.i <- GRangesList(pur.gr.i, compress=TRUE)
cluster.1.pur.grl[i] <- pur.gr.comp.i
}
saveRDS(cluster.1.pyr.grl, file = "./002_Mut_signatures_Pan_cancer/sig/cluster.1.pyr.grl.rds")
saveRDS(cluster.1.pur.grl, file = "./002_Mut_signatures_Pan_cancer/sig/cluster.1.pur.grl.rds")
# Compute mutational matrices
cluster.1.pyr.mat <- mut_matrix(vcf_list = cluster.1.pyr.grl, ref_genome = ref.genome)
cluster.1.pur.mat <- mut_matrix(vcf_list = cluster.1.pur.grl, ref_genome = ref.genome)
# Fit signatures
cluster.1.pyr.fit <- fit_to_signatures(cluster.1.pyr.mat, snvs.ori.mut.cluster)
cluster.1.pur.fit <- fit_to_signatures(cluster.1.pur.mat, snvs.ori.mut.cluster)
# Format
cluster.1.pyr.fit.df <- as.data.frame(cluster.1.pyr.fit$contribution)
cluster.1.pyr.fit.df <- as.data.frame(t(cluster.1.pyr.fit.df)) %>% add_rownames(var = "dist") %>% select(dist, value = cluster.1) %>% mutate(strand = "plus")
cluster.1.pur.fit.df <- as.data.frame(cluster.1.pur.fit$contribution)
cluster.1.pur.fit.df <- as.data.frame(t(cluster.1.pur.fit.df)) %>% add_rownames(var = "dist") %>% select(dist, value = cluster.1) %>% mutate(strand = "minus")
cluster.1.strand.fit.df <- rbind(cluster.1.pyr.fit.df, cluster.1.pur.fit.df)
cluster.1.strand.fit.df$dist <- as.numeric(cluster.1.strand.fit.df$dist)
saveRDS(cluster.1.strand.fit.df, file = "./002_Mut_signatures_Pan_cancer/rds/cluster.1.strand.fit.df.rds")
Plot
library(dplyr)
library(ggplot2)
setwd("/Users/pm23/Desktop/Projects/OriCan")
cluster.1.strand.fit.df <- readRDS("./002_Mut_signatures_Pan_cancer/rds/cluster.1.strand.fit.df.rds")
cluster.1.strand.fit.plot <- cluster.1.strand.fit.df %>% ggplot(aes(x = dist, y = value, color = strand)) +
geom_line() + xlim(-5000, 5000) +
scale_color_manual(values = c("#0474B5", "#E41F1A")) +
xlab("Distance from origin (bp)") + ylab("Absolute SBS\ncluster 1 contribution") +
theme_bw() + theme(aspect.ratio=1, panel.grid.minor = element_line(linetype = "dotted"))
cluster.1.strand.fit.plot

pdf("./Rplots/cluster.1.strand.fit.plot.pdf", width=7, height=6, useDingbats=FALSE)
cluster.1.strand.fit.plot
dev.off()
## quartz_off_screen
## 2
NER efficiency at
TFBS
ENCODE clustered transcription factor binding sites from UCSC http://hgdownload.soe.ucsc.edu/goldenPath/hg19/encRegTfbsClustered/
# R
TF.data.df <- read.table("./Dataset/UCSC/encRegTfbsClusteredWithCells.hg19.bed.gz")
colnames(TF.data.df) <- c("seqnames", "start", "end", "TF", "col", "cell.line")
# Use data from GM23338, skin fibroblast
all.TF.GM23338.data.df <- TF.data.df %>%
filter(cell.line %in% cell.line[grepl("GM23338", cell.line)]) # 123255 TFBS
all.TF.GM23338.data.gr <- makeGRangesFromDataFrame(all.TF.GM23338.data.df, keep.extra.columns = T)
unique(all.TF.GM23338.data.df$TF) # 5 TFs: CTCF, REST, EZH2, ETS1, NANOG
# Filter specific TFs
# We focus on ETS1 and CTCF
ETS1.GM23338.data.df <- TF.data.df %>%
filter(TF == "ETS1") %>%
filter(cell.line %in% cell.line[grepl("GM23338", cell.line)]) # 8856 TFBS
ETS1.GM23338.data.gr <- makeGRangesFromDataFrame(ETS1.GM23338.data.df, keep.extra.columns = T)
CTCF.GM23338.data.df <- TF.data.df %>%
filter(TF == "CTCF") %>%
filter(cell.line %in% cell.line[grepl("GM23338", cell.line)]) # 90599 TFBS
CTCF.GM23338.data.gr <- makeGRangesFromDataFrame(CTCF.GM23338.data.df, keep.extra.columns = T)
################################################################################
# Assess overlap with constitutive origins
ori.df <- read.table("./Dataset/ori/ori.inter.hg19.NCBI.bed")
colnames(ori.df) <- c("seqnames", "start", "end", "EFF", "ori.id", "ori.width")
ori.df <- ori.df %>% mutate(start = round(start-(ori.width/2)), end = round(start+(ori.width/2)))
ori.gr <- makeGRangesFromDataFrame(ori.df, keep.extra.columns = T)
seqlevelsStyle(ori.gr) <- "UCSC"
# 9341 origins
ori.TF.overlap <- findOverlaps(ori.gr, all.TF.GM23338.data.gr, minoverlap = 100L)
ori.no.TF.id <- unique(ori.gr[-queryHits(ori.TF.overlap)])$ori.id # 4377
ori.TF.id <- unique(ori.gr[queryHits(ori.TF.overlap)])$ori.id # 4964
ori.ETS1.overlap <- findOverlaps(ori.gr, ETS1.GM23338.data.gr, minoverlap = 100L)
ori.no.ETS1.id <- unique(ori.gr[-queryHits(ori.ETS1.overlap)])$ori.id # 7596
ori.ETS1.id <- unique(ori.gr[queryHits(ori.ETS1.overlap)])$ori.id # 1745
ori.CTCF.overlap <- findOverlaps(ori.gr, CTCF.GM23338.data.gr, minoverlap = 100L)
ori.no.CTCF.id <- unique(ori.gr[-queryHits(ori.CTCF.overlap)])$ori.id # 6268
ori.CTCF.id <- unique(ori.gr[queryHits(ori.CTCF.overlap)])$ori.id # 3073
############################################################################
# Assess NER activity at TFBS
# Compute TFBS midpoints
all.TF.GM23338.midpoints.df <- all.TF.GM23338.data.df %>%
mutate(start = round((start+end)/2), end = start + 1)
all.TF.GM23338.midpoints.gr <- makeGRangesFromDataFrame(all.TF.GM23338.midpoints.df) # 123255 ranges
ETS1.GM23338.midpoints.df <- ETS1.GM23338.data.df %>%
mutate(start = round((start+end)/2), end = start + 1)
ETS1.GM23338.midpoints.gr <- makeGRangesFromDataFrame(ETS1.GM23338.midpoints.df) # 8856 ranges
CTCF.GM23338.midpoints.df <- CTCF.GM23338.data.df %>%
mutate(start = round((start+end)/2), end = start + 1)
CTCF.GM23338.midpoints.gr <- makeGRangesFromDataFrame(CTCF.GM23338.midpoints.df) # 90599 ranges
# Load XR-seq data
XRseq.CPD.plus.bw <- import.bw("./Dataset/Hu_2015/BW/GSE67941_CSBCPD_Merged_PLUS_UNIQUE_NORM_fixedStep_25.bw")
XRseq.CPD.minus.bw <- import.bw("./Dataset/Hu_2015/BW/GSE67941_CSBCPD_Merged_MINUS_UNIQUE_NORM_fixedStep_25.bw")
# Compute NER activity at TFBS clusters
# For CPD
XRseq.CPD.plus.all.TF.mat <- normalizeToMatrix(XRseq.CPD.plus.bw, all.TF.GM23338.midpoints.gr, value_column = "score", mean_mode = "coverage", extend = 1000, w = 10, background = 0)
saveRDS(XRseq.CPD.plus.all.TF.mat, "./002_Mut_signatures_Pan_cancer/rds/XRseq.CPD.plus.all.TF.mat.rds")
XRseq.CPD.minus.all.TF.mat <- normalizeToMatrix(XRseq.CPD.minus.bw, all.TF.GM23338.midpoints.gr, value_column = "score", mean_mode = "coverage", extend = 1000, w = 10, background = 0)
saveRDS(XRseq.CPD.minus.all.TF.mat, "./002_Mut_signatures_Pan_cancer/rds/XRseq.CPD.minus.all.TF.mat.rds")
XRseq.CPD.plus.ETS1.mat <- normalizeToMatrix(XRseq.CPD.plus.bw, ETS1.GM23338.midpoints.gr, value_column = "score", mean_mode = "coverage", extend = 1000, w = 10, background = 0)
saveRDS(XRseq.CPD.plus.ETS1.mat, "./002_Mut_signatures_Pan_cancer/rds/XRseq.CPD.plus.ETS1.mat.rds")
XRseq.CPD.minus.ETS1.mat <- normalizeToMatrix(XRseq.CPD.minus.bw, ETS1.GM23338.midpoints.gr, value_column = "score", mean_mode = "coverage", extend = 1000, w = 10, background = 0)
saveRDS(XRseq.CPD.minus.ETS1.mat, "./002_Mut_signatures_Pan_cancer/rds/XRseq.CPD.minus.ETS1.mat.rds")
XRseq.CPD.plus.CTCF.mat <- normalizeToMatrix(XRseq.CPD.plus.bw, CTCF.GM23338.midpoints.gr, value_column = "score", mean_mode = "coverage", extend = 1000, w = 10, background = 0)
saveRDS(XRseq.CPD.plus.CTCF.mat, "./002_Mut_signatures_Pan_cancer/rds/XRseq.CPD.plus.CTCF.mat.rds")
XRseq.CPD.minus.CTCF.mat <- normalizeToMatrix(XRseq.CPD.minus.bw, CTCF.GM23338.midpoints.gr, value_column = "score", mean_mode = "coverage", extend = 1000, w = 10, background = 0)
saveRDS(XRseq.CPD.minus.CTCF.mat, "./002_Mut_signatures_Pan_cancer/rds/XRseq.CPD.minus.CTCF.mat.rds")
# Combine information from plus and minus strand, prepare df for plotting
XRseq.CPD.all.TF.values <- colMeans(as.data.frame(XRseq.CPD.plus.all.TF.mat))+colMeans(as.data.frame(XRseq.CPD.minus.all.TF.mat))
XRseq.CPD.ETS1.values <- colMeans(as.data.frame(XRseq.CPD.plus.ETS1.mat))+colMeans(as.data.frame(XRseq.CPD.minus.ETS1.mat))
XRseq.CPD.CTCF.values <- colMeans(as.data.frame(XRseq.CPD.plus.CTCF.mat))+colMeans(as.data.frame(XRseq.CPD.minus.CTCF.mat))
XRseq.CPD.TF.df <- cbind.data.frame(dist = seq(-1000,1000,10), all.TF = XRseq.CPD.all.TF.values, ETS1 = XRseq.CPD.ETS1.values, CTCF = XRseq.CPD.CTCF.values) %>%
gather("overlap", "value", -dist)
saveRDS(XRseq.CPD.TF.df, "./002_Mut_signatures_Pan_cancer/rds/XRseq.CPD.TF.df.rds")
############################################################################
# Assess mutation rates at TFBS
# Load cluster 1 snvs coordinates
cluster.1.grl <- readRDS(file = "./002_Mut_signatures_Pan_cancer/rds/cluster.1.grl.rds")
cluster.1.gr <- unlist(cluster.1.grl)
# Compute distances
snvs.all.TF.nearest <- nearest(cluster.1.gr, all.TF.GM23338.midpoints.gr)
snvs.all.TF.nearest[is.na(snvs.all.TF.nearest)] <- 1 # Arbitrarily assign pos 1 to NA values
snvs.pos <- as.data.frame(ranges(cluster.1.gr))$start
all.TF.pos <- all.TF.GM23338.midpoints.gr[snvs.all.TF.nearest]
snvs.all.TF.nearest.df <- cbind.data.frame(snvs.pos, all.TF.pos) %>% mutate(dist = snvs.pos - start) %>%
filter(dist >= -5000 & dist <= 5000) %>% mutate(dist = (as.numeric(cut(dist, breaks = seq(-5000, 5000, 10)))-500)*10) %>%
group_by(dist) %>% summarise(all.TF = n())
plot(snvs.all.TF.nearest.df$dist, snvs.all.TF.nearest.df$all.TF, xlim = c(-500, 500), ylim = c(2100, 2900))
snvs.ETS1.nearest <- nearest(cluster.1.gr, ETS1.GM23338.midpoints.gr)
snvs.ETS1.nearest[is.na(snvs.ETS1.nearest)] <- 1 # Arbitrarily assign pos 1 to NA values
snvs.pos <- as.data.frame(ranges(cluster.1.gr))$start
ETS1.pos <- ETS1.GM23338.midpoints.gr[snvs.ETS1.nearest]
snvs.ETS1.nearest.df <- cbind.data.frame(snvs.pos, ETS1.pos) %>% mutate(dist = snvs.pos - start) %>%
filter(dist >= -5000 & dist <= 5000) %>% mutate(dist = (as.numeric(cut(dist, breaks = seq(-5000, 5000, 10)))-500)*10) %>%
group_by(dist) %>% summarise(ETS1 = n())
plot(snvs.ETS1.nearest.df$dist, snvs.ETS1.nearest.df$ETS1, xlim = c(-500, 500), ylim = c(0, 400))
snvs.CTCF.nearest <- nearest(cluster.1.gr, CTCF.GM23338.midpoints.gr)
snvs.CTCF.nearest[is.na(snvs.CTCF.nearest)] <- 1 # Arbitrarily assign pos 1 to NA values
snvs.pos <- as.data.frame(ranges(cluster.1.gr))$start
CTCF.pos <- CTCF.GM23338.midpoints.gr[snvs.CTCF.nearest]
snvs.CTCF.nearest.df <- cbind.data.frame(snvs.pos, CTCF.pos) %>% mutate(dist = snvs.pos - start) %>%
filter(dist >= -5000 & dist <= 5000) %>% mutate(dist = (as.numeric(cut(dist, breaks = seq(-5000, 5000, 10)))-500)*10) %>%
group_by(dist) %>% summarise(CTCF = n())
plot(snvs.CTCF.nearest.df$dist, snvs.CTCF.nearest.df$CTCF, xlim = c(-500, 500), ylim = c(1100, 2500))
snvs.count.TF.df <- snvs.all.TF.nearest.df %>% left_join(snvs.ETS1.nearest.df) %>% left_join(snvs.CTCF.nearest.df) %>%
gather("TF", "snvs.count", -dist)
saveRDS(snvs.count.TF.df, "./002_Mut_signatures_Pan_cancer/rds/snvs.count.TF.df.rds")
##########################################################################################################################################################################################
# Compute NER efficiency at origins binned by potential overlap with TFBS
# For CPD/CSB data
XRseq.CSB.CPD.ori.mat <- readRDS(file = "./002_Mut_signatures_Pan_cancer/rds/XRseq.CSB.CPD.ori.mat.rds")
XRseq.CSB.CPD.ori.df <- as.data.frame(XRseq.CSB.CPD.ori.mat)
XRseq.CSB.CPD.ori.df$ori.id <- ori.gr$ori.id
XRseq.CSB.CPD.no.TF.df <- XRseq.CSB.CPD.ori.df %>% filter(ori.id %in% ori.no.TF.id) %>% dplyr::select(-ori.id)
XRseq.CSB.CPD.ETS1.df <- XRseq.CSB.CPD.ori.df %>% filter(ori.id %in% ori.ETS1.id) %>% dplyr::select(-ori.id)
XRseq.CSB.CPD.CTCF.df <- XRseq.CSB.CPD.ori.df %>% filter(ori.id %in% ori.CTCF.id) %>% dplyr::select(-ori.id)
XRseq.CSB.CPD.all.TF.df <- XRseq.CSB.CPD.ori.df %>% filter(ori.id %in% ori.TF.id) %>% dplyr::select(-ori.id)
# Normalise values to background
XRseq.TF.summary.ori.df <- cbind.data.frame(dist = seq(-5000,5000,100), no.TF = as.vector(colMeans(XRseq.CSB.CPD.no.TF.df, na.rm = T)), ETS1 = as.vector(colMeans(XRseq.CSB.CPD.ETS1.df, na.rm = T)), CTCF = as.vector(colMeans(XRseq.CSB.CPD.CTCF.df, na.rm = T)), all.TF = as.vector(colMeans(XRseq.CSB.CPD.all.TF.df, na.rm = T))) %>%
mutate(no.TF = no.TF/mean(no.TF[c(1:10,90:100)]), ETS1 = ETS1/mean(ETS1[c(1:10,90:100)]), CTCF = CTCF/mean(CTCF[c(1:10,90:100)]), all.TF = all.TF/mean(all.TF[c(1:10,90:100)])) %>%
gather("overlap", "value", -dist)
saveRDS(XRseq.TF.summary.ori.df, file = "./002_Mut_signatures_Pan_cancer/rds/XRseq.TF.summary.ori.df.rds")
Plot
library(dplyr)
library(ggplot2)
setwd("/Users/pm23/Desktop/Projects/OriCan")
# XRseq profile at TFSB
XRseq.CPD.TF.df <- readRDS("~/Desktop/Projects/OriCan/002_Mut_signatures_Pan_cancer/rds/XRseq.CPD.TF.df.rds")
XRseq.CPD.TF.plot <- XRseq.CPD.TF.df %>%
ggplot(aes(x=dist, y=value, colour=overlap)) + geom_line() +
scale_color_manual(values = c("#6D6E71", "#6FAB95", "#FABB65")) + xlim(-500, 500) +
xlab("Distance from TFBS (bp)") + ylab("NER efficiency (normalised read count)") +
theme_bw() + theme(aspect.ratio=1.0, panel.grid.minor = element_line(linetype = "dotted"))
XRseq.CPD.TF.plot

# snvs count at TFBS
snvs.count.TF.df <- readRDS("~/Desktop/Projects/OriCan/002_Mut_signatures_Pan_cancer/rds/snvs.count.TF.df.rds")
snvs.count.TF.plot <- snvs.count.TF.df %>%
filter(TF == "all.TF") %>%
ggplot(aes(x=dist, y=snvs.count, colour=TF)) + geom_line(color = "#E41F1A") +
xlim(-500, 500) + ylim(2100, 2900) +
xlab("Distance from TFBS (bp)") + ylab("snvs count") +
theme_bw() + theme(aspect.ratio=1.0, panel.grid.minor = element_line(linetype = "dotted"))
snvs.count.TF.plot

# Xrseq profile at origins with or without TFBS
XRseq.TF.summary.ori.df <- readRDS("~/Desktop/Projects/OriCan/002_Mut_signatures_Pan_cancer/rds/XRseq.TF.summary.ori.df.rds")
XRseq.TF.summary.ori.df.plot <- XRseq.TF.summary.ori.df %>%
filter(overlap != "all.TF") %>%
ggplot(aes(x=dist, y=value, colour=overlap)) + geom_line() +
scale_color_manual(values = c("#6FAB95", "#FABB65", "#0775B6")) +
xlab("Distance from origin (bp)") + ylab("NER efficiency (FC from background)") +
theme_bw() + theme(aspect.ratio=1.0, panel.grid.minor = element_line(linetype = "dotted"))
XRseq.TF.summary.ori.df.plot

# Prepare and save final plots
pdf("~/Desktop/Projects/OriCan/Manuscript/Nature Communications/Rplots/TFBS.plots.pdf", width=12, height=4, useDingbats=FALSE)
ggarrange(XRseq.CPD.TF.plot, snvs.count.TF.plot, XRseq.TF.summary.ori.df.plot, nrow = 1)
dev.off()
## quartz_off_screen
## 2
NER efficiency at
origins
Because NER can be coupled to transcription and that consitutive
origins significantly overlap with genes, we assess the impact of origin
position on XR-seq signal. We bin origins according to their genomic
location (genic, promoter or intergenic).
# r
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
# Load origin center coordinates
ori.df <- read.table("./Dataset/ori/ori.inter.hg19.NCBI.bed", sep="\t",)
colnames(ori.df) <- c("seqnames", "start", "end", "EFF", "ori.id", "ori.width")
ori.gr <- makeGRangesFromDataFrame(ori.df, keep.extra.columns = T)
seqlevelsStyle(ori.gr) <- "UCSC"
# Recover protein coding gene id
hsapiens.coding.genes <- readRDS(file = "./003_Transcriptomics_cluster_analysis/rds/hsapiens.coding.genes.rds")
# Recover gene and promoter coordinates
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
hsapiens.genes.gr <- genes(txdb)
hsapiens.coding.genes.gr <- hsapiens.genes.gr[hsapiens.genes.gr$gene_id %in% hsapiens.coding.genes$ncbi.gene.id]
hsapiens.coding.TSS.gr <- promoters(hsapiens.coding.genes.gr, upstream=500, downstream=500)
# Select origins overlapping TSS or genes
ori.genic.gr <- subsetByOverlaps(ori.gr, hsapiens.coding.genes.gr)
ori.TSS.gr <- subsetByOverlaps(ori.gr, hsapiens.coding.TSS.gr)
# Summarise
ori.df <- ori.df %>% mutate(class = case_when(ori.df$ori.id %in% ori.genic.gr$ori.id ~ "genic",
ori.df$ori.id %in% ori.TSS.gr$ori.id ~ "promoter",
T ~ "intergenic"))
# Compute NER efficiency at origins binned by genomic coordinates
# For CPD/CSB data
XRseq.CPD.plus.ori.mat <- readRDS("./002_Mut_signatures_Pan_cancer/rds/XRseq.CPD.plus.ori.mat.rds")
XRseq.CPD.minus.ori.mat <- readRDS("./002_Mut_signatures_Pan_cancer/rds/XRseq.CPD.minus.ori.mat.rds")
XRseq.CPD.plus.ori.mat.df <- as.data.frame(XRseq.CPD.plus.ori.mat)
XRseq.CPD.minus.ori.mat.df <- as.data.frame(XRseq.CPD.minus.ori.mat)
XRseq.CPD.plus.ori.mat.df$ori.id <- ori.gr$ori.id
XRseq.CPD.minus.ori.mat.df$ori.id <- ori.gr$ori.id
# Compute NER efficiency at origins binned by genomic coordinates
# For CPD/CSB data
XRseq.CSB.CPD.ori.mat <- readRDS(file = "./002_Mut_signatures_Pan_cancer/rds/XRseq.CSB.CPD.ori.mat.rds")
XRseq.CSB.CPD.ori.df <- as.data.frame(XRseq.CSB.CPD.ori.mat)
XRseq.CSB.CPD.ori.df$ori.id <- ori.gr$ori.id
XRseq.CSB.CPD.inter.ori.df <- XRseq.CSB.CPD.ori.df %>% filter(ori.id %in% (ori.df %>% filter(class == "intergenic"))$ori.id) %>% dplyr::select(-ori.id)
XRseq.CSB.CPD.genic.ori.df <- XRseq.CSB.CPD.ori.df %>% filter(ori.id %in% (ori.df %>% filter(class == "genic"))$ori.id) %>% dplyr::select(-ori.id)
XRseq.CSB.CPD.promoter.ori.df <- XRseq.CSB.CPD.ori.df %>% filter(ori.id %in% (ori.df %>% filter(class == "promoter"))$ori.id) %>% dplyr::select(-ori.id)
# Normalise values to background
XRseq.CSB.CPD.summary.ori.df <- cbind.data.frame(dist = seq(-5000,5000,100), inter = as.vector(colMeans(XRseq.CSB.CPD.inter.ori.df, na.rm = T)), genic = as.vector(colMeans(XRseq.CSB.CPD.genic.ori.df, na.rm = T)), promoter = as.vector(colMeans(XRseq.CSB.CPD.promoter.ori.df, na.rm = T))) %>%
mutate(inter = inter/mean(inter[c(1:10,90:100)]), genic = genic/mean(genic[c(1:10,90:100)]), promoter = promoter/mean(promoter[c(1:10,90:100)])) %>%
gather("location", "value", -dist)
saveRDS(XRseq.CSB.CPD.summary.ori.df, file = "./002_Mut_signatures_Pan_cancer/rds/XRseq.CSB.CPD.summary.ori.df.rds")
Plot
library(dplyr)
library(ggplot2)
setwd("/Users/pm23/Desktop/Projects/OriCan")
XRseq.CSB.CPD.summary.ori.df <- readRDS("./002_Mut_signatures_Pan_cancer/rds/XRseq.CSB.CPD.summary.ori.df.rds")
XRseq.CSB.CPD.summary.ori.plot <- XRseq.CSB.CPD.summary.ori.df %>%
ggplot(aes(x=dist, y=value, colour=location)) + geom_line() +
scale_color_manual(values = c("#4F4A75", "#0775B6", "#E4221E")) +
xlab("Distance from origin (bp)") + ylab("NER efficiency (FC from background)") +
theme_bw() + theme(aspect.ratio=1.0, panel.grid.minor = element_line(linetype = "dotted"))
XRseq.CSB.CPD.summary.ori.plot

pdf("./Rplots/XRseq.CSB.CPD.summary.ori.plot.pdf", width=7, height=6, useDingbats=FALSE)
XRseq.CSB.CPD.summary.ori.plot
dev.off()
## quartz_off_screen
## 2
Characterisition of
cluster 2 SBS signature
Cluster 2 signature present an unusual profile with enrichment of
mutation in the G[T>G]G context. In this section we aim to
characterise further the context of these most abundant T>G
mutations.
T>G mutation
context
We first combine all T>G mutations at origins and flanks to
extract a sequence logo
# r
# Assess expended mutational profile of cluster 2 SBS
# Select cluster 2 genomes
clusters.df <- readRDS("./002_Mut_signatures_Pan_cancer/rds/clusters.df.rds")
cluster.2.id <- (clusters.df %>% filter(cluster == 2))$donor.id # 42 genomes
snvs.ori.grl <- readRDS("./002_Mut_signatures_Pan_cancer/rds/snvs.ori.grl.rds")
snvs.flanks.ori.grl <- readRDS("./002_Mut_signatures_Pan_cancer/rds/snvs.flanks.ori.grl.rds")
snvs.ori.cluster.2.grl <- snvs.ori.grl[names(snvs.ori.grl) %in% cluster.2.id]
snvs.flanks.cluster.2.grl <- snvs.flanks.ori.grl[names(snvs.flanks.ori.grl) %in% cluster.2.id]
# Extract contexts to select the G[T>G]G mutation characterising cluster 2 SBS
# Load genome
ref.genome <- BSgenome.Hsapiens.UCSC.hg19
# At origins
snvs.ori.unlist.gr <- unlist(snvs.ori.cluster.2.grl) # 3,042 mutations
snvs.ori.mut.type <- mut_type(snvs.ori.unlist.gr)
snvs.ori.mut.context <- mut_context(snvs.ori.unlist.gr, ref_genome = ref.genome)
snvs.ori.mut.summary.df <- cbind.data.frame(seqnames = seqnames(snvs.ori.unlist.gr),
start = start(ranges(snvs.ori.unlist.gr)),
end = end(ranges(snvs.ori.unlist.gr)),
REF = snvs.ori.unlist.gr$REF,
mut.type = snvs.ori.mut.type,
mut.context = snvs.ori.mut.context)
snvs.ori.mut.G.TG.G.df <- snvs.ori.mut.summary.df %>% filter(mut.type == "T>G") %>%
mutate(strand = case_when(REF == "T" ~ "+", T ~ "-")) # 929 snvs
snvs.ori.mut.G.TG.G.gr <- makeGRangesFromDataFrame(snvs.ori.mut.G.TG.G.df, keep.extra.columns = T) + 10
snvs.ori.mut.G.TG.G.seq <- getSeq(Hsapiens, snvs.ori.mut.G.TG.G.gr)
# Build sequence logo
snvs.ori.mut.G.TG.G.seq.list <- as.vector(snvs.ori.mut.G.TG.G.seq)
saveRDS(snvs.ori.mut.G.TG.G.seq.list, file = "./002_Mut_signatures_Pan_cancer/sig/snvs.ori.mut.G.TG.G.seq.list.rds")
# At flanks
snvs.flanks.unlist.gr <- unlist(snvs.flanks.cluster.2.grl) # 34,173 mutations
snvs.flanks.mut.type <- mut_type(snvs.flanks.unlist.gr)
snvs.flanks.mut.context <- mut_context(snvs.flanks.unlist.gr, ref_genome = ref.genome)
snvs.flanks.mut.summary.df <- cbind.data.frame(seqnames = seqnames(snvs.flanks.unlist.gr),
start = start(ranges(snvs.flanks.unlist.gr)),
end = end(ranges(snvs.flanks.unlist.gr)),
REF = snvs.flanks.unlist.gr$REF,
mut.type = snvs.flanks.mut.type,
mut.context = snvs.flanks.mut.context)
snvs.flanks.mut.G.TG.G.df <- snvs.flanks.mut.summary.df %>% filter(mut.type == "T>G") %>%
mutate(strand = case_when(REF == "T" ~ "+", T ~ "-")) # 7,098 snvs
snvs.flanks.mut.G.TG.G.gr <- makeGRangesFromDataFrame(snvs.flanks.mut.G.TG.G.df, keep.extra.columns = T) + 10
snvs.flanks.mut.G.TG.G.seq <- getSeq(Hsapiens, snvs.flanks.mut.G.TG.G.gr)
# Build sequence list for logo
snvs.flanks.mut.G.TG.G.seq.list <- as.vector(snvs.flanks.mut.G.TG.G.seq)
saveRDS(snvs.flanks.mut.G.TG.G.seq.list, file = "./002_Mut_signatures_Pan_cancer/sig/snvs.flanks.mut.G.TG.G.seq.list.rds")
Plot DNA logos
library(dplyr)
library(ggplot2)
library(ggseqlogo)
library(ggpubr)
setwd("/Users/pm23/Desktop/Projects/OriCan")
# Load data
snvs.ori.mut.G.TG.G.seq.list <- readRDS(file = "./002_Mut_signatures_Pan_cancer/sig/snvs.ori.mut.G.TG.G.seq.list.rds")
snvs.flanks.mut.G.TG.G.seq.list <- readRDS(file = "./002_Mut_signatures_Pan_cancer/sig/snvs.flanks.mut.G.TG.G.seq.list.rds")
logo.col <- make_col_scheme(chars=c('A', 'T', 'C', 'G'),cols=c('#ADCC55', '#DF1F18', '#33BAED', '#E6A004'))
logo.cluster.2.ori <-ggplot() + geom_logo(snvs.ori.mut.G.TG.G.seq.list, col_scheme=logo.col) + theme_logo() + theme(aspect.ratio=0.25) + ggtitle("origins")
logo.cluster.2.flanks <- ggplot() + geom_logo(snvs.flanks.mut.G.TG.G.seq.list, col_scheme=logo.col) + theme_logo() + theme_logo() + theme(aspect.ratio=0.25) + ggtitle("flanks")
ggarrange(logo.cluster.2.ori, logo.cluster.2.flanks, nrow = 2)

pdf("./Rplots/cluster.2.DNA.logo.pdf", width=5, height=5, useDingbats=FALSE)
ggarrange(logo.cluster.2.ori, logo.cluster.2.flanks, nrow = 2)
dev.off()
## quartz_off_screen
## 2
Extended T>G mutation context is analysed using a river plot
# r/SLURM
# We consider 5nt around T>G mutations context
snvs.ori.cluster.2.mut.mat.ext.context <- mut_matrix(snvs.ori.unlist.gr[start(ranges(snvs.ori.unlist.gr)) %in% snvs.ori.mut.G.TG.G.df$start], ref_genome = ref.genome, extension = 5)
colnames(snvs.ori.cluster.2.mut.mat.ext.context) <- c("ori")
snvs.flanks.cluster.2.mut.mat.ext.context <- mut_matrix(snvs.flanks.unlist.gr[start(ranges(snvs.flanks.unlist.gr)) %in% snvs.flanks.mut.G.TG.G.df$start], ref_genome = ref.genome, extension = 5)
colnames(snvs.flanks.cluster.2.mut.mat.ext.context) <- c("flanks")
snvs.cluster.2.mut.mat.ext.context <- cbind(snvs.ori.cluster.2.mut.mat.ext.context, snvs.flanks.cluster.2.mut.mat.ext.context)
saveRDS(snvs.cluster.2.mut.mat.ext.context, file = "./002_Mut_signatures_Pan_cancer/sig/snvs.cluster.2.mut.mat.ext.context.rds")
# As the generation of the river plot uses a lot of memory, the plot is generated externally
pdf("./Rplots/cluster.2.extended.profile.5nt.pdf", width=8, height=6, useDingbats=FALSE)
plot_river(snvs.cluster.2.mut.mat.ext.context) + theme(aspect.ratio=0.3)
dev.off()
Both DNA logos and the river plot suggest that T>G mutations occur
within G-quadruplex (G4) structures.
Cluster 2 cancer
sample specific signatures
# R
# Compute cancer type specific signature from tumour in cluster 2
# Load cluster information
clusters.df <- readRDS("~/Desktop/Projects/OriCan/002_Mut_signatures_Pan_cancer/rds/clusters.df.rds")
# Load mutational matrix
snvs.diff.mut.mat <- readRDS("~/Desktop/Projects/OriCan/002_Mut_signatures_Pan_cancer/rds/snvs.diff.mut.mat.rds")
# Sum mutation counts per clusters
cluster.2.id <- (clusters.df %>% filter(cluster == 2))$donor.id
# Add cancer type information
snvs.dens.ratio.df <- readRDS("~/Desktop/Projects/OriCan/001_Mut_density_Pan_cancer/rds/snvs.dens.ratio.df.rds")
sample.info.df <- snvs.dens.ratio.df %>% dplyr::select(donor, cancer.type, cancer.project, Primary.Site)
cluster.2.sample.info.df <- sample.info.df %>% filter(donor %in% cluster.2.id)
unique(cluster.2.sample.info.df$cancer.type)
# "PACA" "ESAD" "MELA" "PBCA" "OV" "BOCA" "CMDI" "BRCA"
# Compute mutational signatures for selected cancer type
# PACA
cluster.2.PACA.id <- cluster.2.sample.info.df %>% filter(cancer.type == "PACA")
cluster.2.PACA.mut <- as.data.frame(snvs.diff.mut.mat) %>% dplyr::select(all_of(cluster.2.PACA.id$donor)) %>% mutate(PACA = rowSums(.)) %>% dplyr::select(PACA)
# ESAD
cluster.2.ESAD.id <- cluster.2.sample.info.df %>% filter(cancer.type == "ESAD")
cluster.2.ESAD.mut <- as.data.frame(snvs.diff.mut.mat) %>% dplyr::select(all_of(cluster.2.ESAD.id$donor)) %>% mutate(ESAD = rowSums(.)) %>% dplyr::select(ESAD)
# MELA
cluster.2.MELA.id <- cluster.2.sample.info.df %>% filter(cancer.type == "MELA")
cluster.2.MELA.mut <- as.data.frame(snvs.diff.mut.mat) %>% dplyr::select(all_of(cluster.2.MELA.id$donor)) %>% mutate(MELA = rowSums(.)) %>% dplyr::select(MELA)
# PBCA
cluster.2.PBCA.id <- cluster.2.sample.info.df %>% filter(cancer.type == "PBCA")
cluster.2.PBCA.mut <- as.data.frame(snvs.diff.mut.mat) %>% dplyr::select(all_of(cluster.2.PBCA.id$donor)) %>% mutate(PBCA = rowSums(.)) %>% dplyr::select(PBCA)
# OV
cluster.2.OV.id <- cluster.2.sample.info.df %>% filter(cancer.type == "OV")
cluster.2.OV.mut <- as.data.frame(snvs.diff.mut.mat) %>% dplyr::select(all_of(cluster.2.OV.id$donor)) %>% mutate(OV = rowSums(.)) %>% dplyr::select(OV)
# BOCA
cluster.2.BOCA.id <- cluster.2.sample.info.df %>% filter(cancer.type == "BOCA")
cluster.2.BOCA.mut <- as.data.frame(snvs.diff.mut.mat) %>% dplyr::select(all_of(cluster.2.BOCA.id$donor)) %>% mutate(BOCA = rowSums(.)) %>% dplyr::select(BOCA)
# CMDI
cluster.2.CMDI.id <- cluster.2.sample.info.df %>% filter(cancer.type == "CMDI")
cluster.2.CMDI.mut <- as.data.frame(snvs.diff.mut.mat) %>% dplyr::select(all_of(cluster.2.CMDI.id$donor)) %>% mutate(CMDI = rowSums(.)) %>% dplyr::select(CMDI)
# BRCA
cluster.2.BRCA.id <- cluster.2.sample.info.df %>% filter(cancer.type == "BRCA")
cluster.2.BRCA.mut <- as.data.frame(snvs.diff.mut.mat) %>% dplyr::select(all_of(cluster.2.BRCA.id$donor)) %>% mutate(BRCA = rowSums(.)) %>% dplyr::select(BRCA)
# Combine all signatures
cluster.2.cancer.type.mut <- cbind.data.frame(cluster.2.PACA.mut, cluster.2.ESAD.mut, cluster.2.MELA.mut,
cluster.2.PBCA.mut, cluster.2.OV.mut, cluster.2.BOCA.mut,
cluster.2.CMDI.mut, cluster.2.BRCA.mut)
saveRDS(cluster.2.cancer.type.mut, file = "~/Desktop/Projects/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster.2.cancer.type.mut.rds")
Plot profile
library(dplyr)
library(ggplot2)
library(MutationalPatterns)
# Load data
cluster.2.cancer.type.mut <- readRDS(file = "~/Desktop/Projects/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster.2.cancer.type.mut.rds")
plot_96_profile(cluster.2.cancer.type.mut, condensed = TRUE, ymax = 0.45)

pdf("~/Desktop/Projects/OriCan/Manuscript/Nature Communications/Rplots/cluster.2.cancer.type.signatures.pdf", width=5, height=8, useDingbats=FALSE)
plot_96_profile(cluster.2.cancer.type.mut, condensed = TRUE, ymax = 0.45)
dev.off()
## quartz_off_screen
## 2
G-quadruplex
occurences at mutation sites
In this section we assess the contribution of G4s to cluster 2 SBS.
To quantify the ability of sequences to fold into G4 structures, we use
the G4Hunter algortithm (Bedrat et al. NAR 2016, https://academic.oup.com/nar/article/44/4/1746/1854457)
# Compute G4Hunter score of sequences encompassing T>G mutations (+- 25nt)
library(S4Vectors)
library(Biostrings)
source("./Scripts/scoreG4hunt.R")
# Compute G4H scores for T>G mutations at origins
snvs.ori.mut.G.TG.G.ext.gr <- makeGRangesFromDataFrame(snvs.ori.mut.G.TG.G.df, keep.extra.columns = T) + 25
snvs.ori.mut.G.TG.G.ext.seq <- getSeq(Hsapiens, snvs.ori.mut.G.TG.G.ext.gr)
snvs.ori.GT.G4H <- vector()
for (i in 1:length(snvs.ori.mut.G.TG.G.ext.seq)) {
print(i/length(snvs.ori.mut.G.TG.G.ext.seq))
seq.i <- snvs.ori.mut.G.TG.G.ext.seq[i]
G4H.i <- scoreG4hunt(seq.i)
snvs.ori.GT.G4H <- c(snvs.ori.GT.G4H, G4H.i)
}
snvs.ori.GT.G4H.df <- cbind.data.frame(data = "ori", G4H.score = snvs.ori.GT.G4H)
# Compute G4H scores for T>G mutations at origin flanks
snvs.flanks.mut.G.TG.G.ext.gr <- makeGRangesFromDataFrame(snvs.flanks.mut.G.TG.G.df, keep.extra.columns = T) + 25
snvs.flanks.mut.G.TG.G.ext.seq <- getSeq(Hsapiens, snvs.flanks.mut.G.TG.G.ext.gr)
snvs.flanks.GT.G4H <- vector()
for (i in 1:length(snvs.flanks.mut.G.TG.G.ext.seq)) {
print(i/length(snvs.flanks.mut.G.TG.G.ext.seq))
seq.i <- snvs.flanks.mut.G.TG.G.ext.seq[i]
G4H.i <- scoreG4hunt(seq.i)
snvs.flanks.GT.G4H <- c(snvs.flanks.GT.G4H, G4H.i)
}
snvs.flanks.GT.G4H.df <- cbind.data.frame(data = "flanks", G4H.score = snvs.flanks.GT.G4H)
# Compute G4H scores for all Ts at origins (center +- 500bp)
# Recover origin domain coordinates
ori.1kb.df <- read.table("./Dataset/ori/ori.1kb.domain.hg19.NCBI.bed", sep="\t")
colnames(ori.1kb.df) <- c("seqnames", "start", "end", "EFF", "ori.id", "ori.width")
ori.1kb.gr <- makeGRangesFromDataFrame(ori.1kb.df)
my.chr <- c(1:22, "X", "Y")
ori.1kb.gr <- ori.1kb.gr[seqnames(ori.1kb.gr) %in% my.chr]
seqlevelsStyle(ori.1kb.gr) <- "UCSC"
# Recover origin sequences
ori.1kb.seq <- getSeq(Hsapiens, ori.1kb.gr)
# Comute G4H score of all N25TN25 and N25AN25 sequences
library(stringr)
G4H.all.ori.Ts <- vector()
for(i in 1:length(ori.1kb.seq)) {
print(i/length(ori.1kb.seq))
seq.i <- ori.1kb.seq[i]
T.seq.i <- str_extract_all(as.character(seq.i), pattern = "[A|C|G|T]{25}T[A|C|G|T]{25}")
A.seq.i <- str_extract_all(as.character(seq.i), pattern = "[A|C|G|T]{25}A[A|C|G|T]{25}")
AT.seq.i <- c(T.seq.i[[1]], A.seq.i[[1]])
G4H.i <- vector()
for (j in 1 :length(AT.seq.i)) {
seq.j <- AT.seq.i[j]
G4H.j <- scoreG4hunt(seq.j)
G4H.i <- c(G4H.i, G4H.j)
}
G4H.all.ori.Ts <- c(G4H.all.ori.Ts, G4H.i)
}
G4H.all.ori.Ts.df <- cbind.data.frame(data = "all.Ts", G4H.score = G4H.all.ori.Ts)
# Combine information
snvs.GT.G4H.summary.df <- rbind(snvs.ori.GT.G4H.df, snvs.flanks.GT.G4H.df, G4H.all.ori.Ts.df)
saveRDS(snvs.GT.G4H.summary.df, file = "./002_Mut_signatures_Pan_cancer/rds/snvs.GT.G4H.summary.df.rds")
# Compute stats
ks.test(snvs.GT.G4H.summary.df[which(snvs.GT.G4H.summary.df$data == "ori"),]$G4H.score, snvs.GT.G4H.summary.df[which(snvs.GT.G4H.summary.df$data == "flanks"),]$G4H.score) # p-value < 2.2e-16
ks.test(snvs.GT.G4H.summary.df[which(snvs.GT.G4H.summary.df$data == "ori"),]$G4H.score, snvs.GT.G4H.summary.df[which(snvs.GT.G4H.summary.df$data == "all.Ts"),]$G4H.score) # p-value < 2.2e-16
library(dplyr)
library(ggplot2)
setwd("/Users/pm23/Desktop/Projects/OriCan")
# Load result
snvs.GT.G4H.summary.df <- readRDS(file = "./002_Mut_signatures_Pan_cancer/rds/snvs.GT.G4H.summary.df.rds")
# Plot
snvs.GT.G4H.summary.plot <- snvs.GT.G4H.summary.df %>%
ggplot(aes(x=data, y=G4H.score, alpha = 0.3)) +
geom_boxplot(width = 0.5, fill="#D56114") + ylim(-3,5) +
geom_hline(yintercept=0, linetype="dashed") + ylab("G4H score") +
theme_bw() + theme(aspect.ratio=2, panel.grid.minor = element_line(linetype = "dotted"), legend.position = "none", axis.title.x=element_blank(), axis.text.x=element_text(angle=45,hjust=1))
snvs.GT.G4H.summary.plot

pdf("./Rplots/snvs.GT.G4H.summary.plot.pdf", width=10, height=6, useDingbats=FALSE)
snvs.GT.G4H.summary.plot
dev.off()
## quartz_off_screen
## 2
Association between
G-quadruplex formation and mutational burden
This section aims to assess the correlation between G-quadruplex
formation and mutational burden at origins. To do so we recover all
G-quadruplex forming sequences (of the for
N5G3+N1-12G3+N1-12G3+N1-12G3+N5), assess their propensity to fold into
G-quadruplexes (G4H score) and their mutability (T>G mutations).
Compute G4 propensity
# r
# Recover all G4 forming sequences at origins and compute their associated G4H scores
ori.1kb.df <- read.table("./Dataset/ori/ori.1kb.domain.hg19.NCBI.bed", sep="\t")
colnames(ori.1kb.df) <- c("seqnames", "start", "end", "EFF", "ori.id", "ori.width")
ori.1kb.gr <- makeGRangesFromDataFrame(ori.1kb.df)
my.chr <- c(1:22, "X", "Y")
ori.1kb.gr <- ori.1kb.gr[seqnames(ori.1kb.gr) %in% my.chr]
seqlevelsStyle(ori.1kb.gr) <- "UCSC"
# Recover origin sequences
ori.1kb.seq <- getSeq(Hsapiens, ori.1kb.gr)
G4.score.ori.df <- tibble()
for (i in 1:length(ori.1kb.seq)) {
print(i/length(ori.1kb.seq))
seq.i <- as.character(ori.1kb.seq[i])
seqnames.i <- seqnames(ori.1kb.gr)[i]
ranges.i <- ranges(ori.1kb.gr)[i]
# Compute score of G4 motif on top strand
G4.seq.i <- str_match_all(seq.i, "(?=([ACGT]{4}[ACT][G]{3,}[ACGT]{1,12}[G]{3,}[ACGT]{1,12}[G]{3,}[ACGT]{1,12}[G]{3,}[ACT][ACGT]{4}))")[[1]][, 2]
G4.seq.length.i <- nchar(G4.seq.i)
if (length(G4.seq.i) > 0) {
G4H.score.i <- vector()
for (j in 1:length(G4.seq.i)) {
G4.seq.j <- DNAStringSet(G4.seq.i)[j]
G4H.score.j <- scoreG4hunt(G4.seq.j)
G4H.score.i <- c(G4H.score.i, G4H.score.j)
}
G4.seq.pos.i <- str_locate_all(seq.i, "(?=([ACGT]{4}[ACT][G]{3,}[ACGT]{1,12}[G]{3,}[ACGT]{1,12}[G]{3,}[ACGT]{1,12}[G]{3,}[ACT][ACGT]{4}))")[[1]]
G4.seq.start.i <- G4.seq.pos.i[,1]
G4.seq.end.i <- G4.seq.start.i+G4.seq.length.i
G4.ranges.start.i <- start(ranges.i)+G4.seq.start.i
G4.ranges.end.i <- start(ranges.i)+G4.seq.end.i
G4.summary.i <- cbind.data.frame(seqnames = seqnames.i, start = G4.ranges.start.i, end = G4.ranges.end.i, class = "G4", seq = G4.seq.i, score = G4H.score.i)
} else {G4.summary.i <- tibble()}
# Compute score of G4 motif on bottom strand
C4.seq.i <- str_match_all(seq.i, "(?=([ACGT]{4}[AGT][C]{3,}[ACGT]{1,12}[C]{3,}[ACGT]{1,12}[C]{3,}[ACGT]{1,12}[C]{3,}[AGT][ACGT]{4}))")[[1]][, 2]
C4.seq.length.i <- nchar(C4.seq.i)
if (length(C4.seq.i) > 0) {
C4H.score.i <- vector()
for (j in 1:length(C4.seq.i)) {
C4.seq.j <- DNAStringSet(C4.seq.i)[j]
C4H.score.j <- scoreG4hunt(reverseComplement(C4.seq.j))
C4H.score.i <- c(C4H.score.i, C4H.score.j)
}
C4.seq.pos.i <- str_locate_all(seq.i, "(?=([ACGT]{4}[AGT][C]{3,}[ACGT]{1,12}[C]{3,}[ACGT]{1,12}[C]{3,}[ACGT]{1,12}[C]{3,}[AGT][ACGT]{4}))")[[1]]
C4.seq.start.i <- C4.seq.pos.i[,1]
C4.seq.end.i <- C4.seq.start.i+C4.seq.length.i
C4.ranges.start.i <- start(ranges.i)+C4.seq.start.i
C4.ranges.end.i <- start(ranges.i)+C4.seq.end.i
C4.summary.i <- cbind.data.frame(seqnames = seqnames.i, start = C4.ranges.start.i, end = C4.ranges.end.i, class = "C4", seq = C4.seq.i, score = C4H.score.i)
} else {C4.summary.i <- tibble()}
G4.score.ori.df <- rbind(G4.score.ori.df, G4.summary.i, C4.summary.i)
}
saveRDS(G4.score.ori.df, file = "./002_Mut_signatures_Pan_cancer/rds/G4.score.ori.df.rds")
# Recover all G4 forming sequences at origin flanks and compute their associated G4H scores
ori.10kb.df <- read.table("./Dataset/ori/ori.10kb.domain.hg19.NCBI.bed", sep="\t")
colnames(ori.10kb.df) <- c("seqnames", "start", "end", "EFF", "ori.id", "ori.width")
ori.10kb.gr <- makeGRangesFromDataFrame(ori.10kb.df)
my.chr <- c(1:22, "X", "Y")
ori.10kb.gr <- ori.10kb.gr[seqnames(ori.10kb.gr) %in% my.chr]
seqlevelsStyle(ori.10kb.gr) <- "UCSC"
# Recover origin sequences
ori.10kb.seq <- getSeq(Hsapiens, ori.10kb.gr)
G4.score.flanks.df <- tibble()
for (i in 1:length(ori.10kb.seq)) {
print(i/length(ori.10kb.seq))
seq.i <- as.character(ori.10kb.seq[i])
seqnames.i <- seqnames(ori.1kb.gr)[i]
ranges.i <- ranges(ori.1kb.gr)[i]
# Compute score of G4 motif on top strand
G4.seq.i <- str_match_all(seq.i, "(?=([ACGT]{4}[ACT][G]{3,}[ACGT]{1,12}[G]{3,}[ACGT]{1,12}[G]{3,}[ACGT]{1,12}[G]{3,}[ACT][ACGT]{4}))")[[1]][, 2]
G4.seq.length.i <- nchar(G4.seq.i)
if (length(G4.seq.i) > 0) {
G4H.score.i <- vector()
for (j in 1:length(G4.seq.i)) {
G4.seq.j <- DNAStringSet(G4.seq.i)[j]
G4H.score.j <- scoreG4hunt(G4.seq.j)
G4H.score.i <- c(G4H.score.i, G4H.score.j)
}
G4.seq.pos.i <- str_locate_all(seq.i, "(?=([ACGT]{4}[ACT][G]{3,}[ACGT]{1,12}[G]{3,}[ACGT]{1,12}[G]{3,}[ACGT]{1,12}[G]{3,}[ACT][ACGT]{4}))")[[1]]
G4.seq.start.i <- G4.seq.pos.i[,1]
G4.seq.end.i <- G4.seq.start.i+G4.seq.length.i
G4.ranges.start.i <- start(ranges.i)+G4.seq.start.i
G4.ranges.end.i <- start(ranges.i)+G4.seq.end.i
G4.summary.i <- cbind.data.frame(seqnames = seqnames.i, start = G4.ranges.start.i, end = G4.ranges.end.i, class = "G4", seq = G4.seq.i, score = G4H.score.i)
} else {G4.summary.i <- tibble()}
# Compute score of G4 motif on bottom strand
C4.seq.i <- str_match_all(seq.i, "(?=([ACGT]{4}[AGT][C]{3,}[ACGT]{1,12}[C]{3,}[ACGT]{1,12}[C]{3,}[ACGT]{1,12}[C]{3,}[AGT][ACGT]{4}))")[[1]][, 2]
C4.seq.length.i <- nchar(C4.seq.i)
if (length(C4.seq.i) > 0) {
C4H.score.i <- vector()
for (j in 1:length(C4.seq.i)) {
C4.seq.j <- DNAStringSet(C4.seq.i)[j]
C4H.score.j <- scoreG4hunt(reverseComplement(C4.seq.j))
C4H.score.i <- c(C4H.score.i, C4H.score.j)
}
C4.seq.pos.i <- str_locate_all(seq.i, "(?=([ACGT]{4}[AGT][C]{3,}[ACGT]{1,12}[C]{3,}[ACGT]{1,12}[C]{3,}[ACGT]{1,12}[C]{3,}[AGT][ACGT]{4}))")[[1]]
C4.seq.start.i <- C4.seq.pos.i[,1]
C4.seq.end.i <- C4.seq.start.i+C4.seq.length.i
C4.ranges.start.i <- start(ranges.i)+C4.seq.start.i
C4.ranges.end.i <- start(ranges.i)+C4.seq.end.i
C4.summary.i <- cbind.data.frame(seqnames = seqnames.i, start = C4.ranges.start.i, end = C4.ranges.end.i, class = "C4", seq = C4.seq.i, score = C4H.score.i)
} else {C4.summary.i <- tibble()}
G4.score.flanks.df <- rbind(G4.score.flanks.df, G4.summary.i, C4.summary.i)
}
# Filter out G4 at origins
G4.score.flanks.gr <- makeGRangesFromDataFrame(G4.score.flanks.df, keep.extra.columns = T) # 181,147 G4s
G4.overlap <- findOverlaps(G4.score.flanks.gr, G4.score.ori.gr)
G4.score.flanks.filter.gr <- G4.score.flanks.gr[-queryHits(G4.overlap)] # 179,962 G4s
saveRDS(G4.score.flanks.filter.gr, file = "./002_Mut_signatures_Pan_cancer/rds/G4.score.flanks.filter.gr.rds")
Subset mutations overlapping G4 forming sequences
# r
# Load G4 information
G4.score.ori.df <- readRDS(file = "./002_Mut_signatures_Pan_cancer/rds/G4.score.ori.df.rds")
G4.score.ori.gr <- makeGRangesFromDataFrame(G4.score.ori.df, keep.extra.columns = T) # 69,914 G4
G4.score.flanks.filter.gr <- readRDS(file = "./002_Mut_signatures_Pan_cancer/rds/G4.score.flanks.filter.gr.rds")
# Load cluster 2 snvs information
snvs.ori.grl <- readRDS("./002_Mut_signatures_Pan_cancer/rds/snvs.ori.grl.rds")
snvs.flanks.ori.grl <- readRDS("./002_Mut_signatures_Pan_cancer/rds/snvs.flanks.ori.grl.rds")
clusters.df <- readRDS("./002_Mut_signatures_Pan_cancer/rds/clusters.df.rds")
cluster.2.id <- (clusters.df %>% filter(cluster == 2))$donor.id # 42 genomes
snvs.ori.cluster.2.grl <- snvs.ori.grl[names(snvs.ori.grl) %in% cluster.2.id]
# Format origins cluster 2 snvs
snvs.ori.cluster.2.gr <- unlist(snvs.ori.cluster.2.grl) # 3,042 mutations
snvs.ori.cluster.2.gr$mut <- paste(as.character(snvs.ori.cluster.2.gr$REF), as.character(unlist(snvs.ori.cluster.2.gr$ALT)), sep = ">")
ori.mut <- snvs.ori.cluster.2.gr$mut
ori.MUT <- case_when(ori.mut == "A>C" ~ "T>G",
ori.mut == "A>G" ~ "T>C",
ori.mut == "A>T" ~ "T>A",
ori.mut == "G>A" ~ "C>T",
ori.mut == "G>C" ~ "C>G",
ori.mut == "G>T" ~ "C>A",
T ~ ori.mut)
snvs.ori.cluster.2.gr$MUT <- ori.MUT
snvs.ori.cluster.2.TG.gr <- snvs.ori.cluster.2.gr[snvs.ori.cluster.2.gr$mut == "T>G"] # 498 snvs
snvs.ori.cluster.2.AC.gr <- snvs.ori.cluster.2.gr[snvs.ori.cluster.2.gr$mut == "A>C"] # 431 snvs
# Format flanks cluster 2 snvs
snvs.flanks.cluster.2.grl <- snvs.flanks.ori.grl[names(snvs.flanks.ori.grl) %in% cluster.2.id]
snvs.flanks.cluster.2.gr <- unlist(snvs.flanks.cluster.2.grl)
snvs.flanks.cluster.2.gr$mut <- paste(as.character(snvs.flanks.cluster.2.gr$REF), as.character(unlist(snvs.flanks.cluster.2.gr$ALT)), sep = ">")
flanks.mut <- snvs.flanks.cluster.2.gr$mut
flanks.MUT <- case_when(flanks.mut == "A>C" ~ "T>G",
flanks.mut == "A>G" ~ "T>C",
flanks.mut == "A>T" ~ "T>A",
flanks.mut == "G>A" ~ "C>T",
flanks.mut == "G>C" ~ "C>G",
flanks.mut == "G>T" ~ "C>A",
T ~ flanks.mut)
snvs.flanks.cluster.2.gr$MUT <- flanks.MUT
# Subset G4 at origins with T>G mutations
snvs.ori.cluster.2.TG.G4.gr <- subsetByOverlaps(G4.score.ori.gr[G4.score.ori.gr$class == "G4"], snvs.ori.cluster.2.TG.gr) # 670 G4s
snvs.ori.cluster.2.AC.G4.gr <- subsetByOverlaps(G4.score.ori.gr[G4.score.ori.gr$class == "C4"], snvs.ori.cluster.2.AC.gr) # 588 C4s
snvs.ori.cluster.2.mut.G4.gr <- c(snvs.ori.cluster.2.TG.G4.gr, snvs.ori.cluster.2.AC.G4.gr) # 1258 G4s
# Compare G4Hscore of mutated and non-mutated G4s at origins
G4.ori.summary.df <- cbind.data.frame(G4H.score = G4.score.ori.gr$score, class = "all.G4s")
G4.ori.mut.summary.df <- cbind.data.frame(G4H.score = snvs.ori.cluster.2.mut.G4.gr$score, class = "mut.G4s")
G4.ori.score.summary.df <- rbind(G4.ori.summary.df, G4.ori.mut.summary.df) # 71,172 origins
# Compute stats
ks.test(G4.ori.summary.df$G4H.score, G4.ori.mut.summary.df$G4H.score) # p-value < 2.2e-16
saveRDS(G4.ori.score.summary.df, file = "./002_Mut_signatures_Pan_cancer/rds/G4.ori.summary.df.rds")
# Compare G4Hscore of G4s at origins or origin flanks
G4.ori.score.df <- cbind.data.frame(class = "ori", G4H.score = G4.ori.dist.gr$score)
G4.flanks.score.df <- cbind.data.frame(class = "flanks", G4H.score = G4.flanks.dist.gr$score)
G4.ori.flanks.summary <- rbind(G4.ori.score.df, G4.flanks.score.df)
# Compute stats
ks.test(G4.ori.score.df$G4H.score, G4.flanks.score.df$G4H.score, alternative = "greater") # p-value = 0.09904
saveRDS(G4.ori.flanks.summary, file = "./002_Mut_signatures_Pan_cancer/rds/G4.ori.flanks.summary.rds")
# Subset mutations overlapping G4s at origins or flanks
snvs.G4.ori.cluster.2.gr <- subsetByOverlaps(snvs.ori.cluster.2.gr, G4.score.ori.gr) # 984 snvs
snvs.G4.flanks.cluster.2.gr <- subsetByOverlaps(snvs.flanks.cluster.2.gr, G4.score.flanks.filter.gr) # 533 G4s
# Compare mutation types at origins and flanks
# Prepare mutation frequency table
snvs.ori.table <- as.data.frame(table(snvs.G4.ori.cluster.2.gr$MUT))
colnames(snvs.ori.table) <- c("mut", "count")
snvs.ori.table <- snvs.ori.table %>% mutate(freq = count/sum(count), class = "ori")
snvs.flanks.table <- as.data.frame(table(snvs.G4.flanks.cluster.2.gr$MUT))
colnames(snvs.flanks.table) <- c("mut", "count")
snvs.flanks.table <- snvs.flanks.table %>% mutate(freq = count/sum(count), class = "flanks")
snvs.table.df <- rbind(snvs.ori.table, snvs.flanks.table)
saveRDS(snvs.table.df, file = "./002_Mut_signatures_Pan_cancer/rds/snvs.table.df.rds")
Plot results.
library(dplyr)
library(ggplot2)
setwd("/Users/pm23/Desktop/Projects/OriCan")
# Load results
G4.ori.score.summary.df <- readRDS(file = "./002_Mut_signatures_Pan_cancer/rds/G4.ori.summary.df.rds")
G4.ori.flanks.summary <- readRDS(file = "./002_Mut_signatures_Pan_cancer/rds/G4.ori.flanks.summary.rds")
snvs.table.df <- readRDS(file = "./002_Mut_signatures_Pan_cancer/rds/snvs.table.df.rds")
# Plot 1
G4.ori.score.summary.plot <- G4.ori.score.summary.df %>%
ggplot(aes(x=class, y=G4H.score, alpha = 0.3)) +
geom_boxplot(width = 0.5, fill="#D56114") + ylim(-0.5,4.5) +
geom_hline(yintercept=0, linetype="dashed") + ylab("G4H score") +
theme_bw() + theme(aspect.ratio=2, panel.grid.minor = element_line(linetype = "dotted"), legend.position = "none", axis.title.x=element_blank(), axis.text.x=element_text(angle=45,hjust=1))
G4.ori.score.summary.plot

pdf("./Rplots/G4.ori.score.summary.plot.pdf", width=10, height=6, useDingbats=FALSE)
G4.ori.score.summary.plot
dev.off()
## quartz_off_screen
## 2
# Plot 2
G4.ori.flanks.summary.plot <- G4.ori.flanks.summary %>%
ggplot(aes(x=class, y=G4H.score, alpha = 0.3)) +
geom_boxplot(width = 0.5, fill="#D56114") + ylim(-0.5,4.5) +
geom_hline(yintercept=0, linetype="dashed") + ylab("G4H score") +
theme_bw() + theme(aspect.ratio=2, panel.grid.minor = element_line(linetype = "dotted"), legend.position = "none", axis.title.x=element_blank(), axis.text.x=element_text(angle=45,hjust=1))
G4.ori.flanks.summary.plot

pdf("./Rplots/G4.ori.flanks.summary.plot.pdf", width=10, height=6, useDingbats=FALSE)
G4.ori.flanks.summary.plot
dev.off()
## quartz_off_screen
## 2
# Plot 3
snvs.table.plot <- snvs.table.df %>% ggplot(aes(fill=mut, y=freq, x=class)) +
geom_bar(position="stack", stat="identity") + ylab("Frequency") +
scale_fill_manual(values = c("#33BAED", "#010101", "#DF1F18", "#D4D2D2", "#ADCC55", "#F1D1CF")) +
theme_bw() + theme(aspect.ratio=2, panel.grid.minor = element_line(linetype = "dotted"), axis.title.x=element_blank(), axis.text.x=element_text(angle=45,hjust=1))
snvs.table.plot

# Compute odds ratio and associated p-value
fisher.df <- data.frame(
"yes" = c(484, 124),
"no" = c(500, 409),
row.names = c("ori", "flanks"),
stringsAsFactors = FALSE
)
colnames(fisher.df) <- c("T>G", "others")
test <- fisher.test(fisher.df) # Odds ratio 3.19, pval = 1.366678e-23
test$estimate
## odds ratio
## 3.190414
test$p.value
## [1] 1.366678e-23
pdf("./Rplots/snvs.table.plot.pdf", width=7, height=6, useDingbats=FALSE)
snvs.table.plot
dev.off()
## quartz_off_screen
## 2
Mutational burden at
experimentally mapped G4s
# R
library(S4Vectors)
library(Biostrings)
library(BSgenome.Hsapiens.UCSC.hg19)
source("./Scripts/scoreG4hunt.R")
source("./Scripts/G4_classification.R")
# BED files were shared by the authors of the original report
# Esnault et al. Nature Genetics volume 55, pages 1359–1369 (2023)
# We also used the authors script to extract the max G4H score of peak sequences
# Load G4access peak coordinates
G4access.HaCaT.hg19.df <- read.table("./Dataset/G4access/HaCaT_hg19_G4access.bed") %>% mutate(V5 = "HaCaT")
colnames(G4access.HaCaT.hg19.df) <- c("seqnames", "start", "end", "score", "cell.line")
G4access.K562.hg19.df <- read.table("./Dataset/G4access/K562_hg19_G4access.bed") %>% mutate(V5 = "K562")
colnames(G4access.K562.hg19.df) <- c("seqnames", "start", "end", "score", "cell.line")
G4access.Raji.hg19.df <- read.table("./Dataset/G4access/Raji_hg19_G4access.bed") %>% mutate(V5 = "Raji")
colnames(G4access.Raji.hg19.df) <- c("seqnames", "start", "end", "score", "cell.line")
# Combine and liftover
G4access.hg19.df <- rbind(G4access.HaCaT.hg19.df, G4access.K562.hg19.df, G4access.Raji.hg19.df)
G4access.hg19.gr <- makeGRangesFromDataFrame(G4access.hg19.df, keep.extra.columns = T) # 68192 peaks
hist(log10(width(G4access.hg19.gr)), breaks = 100)
# bimodal distribution with separation at 120 nt
# Recover G4 peaks at origins domain
ori.1kb.domain.bed <- read.table("./Dataset/ori/ori.1kb.domain.hg19.NCBI.bed")
colnames(ori.1kb.domain.bed) <- c("seqnames", "start", "end", "EFF", "ori.id")
ori.1kb.domain.gr <- makeGRangesFromDataFrame(ori.1kb.domain.bed, keep.extra.columns = T)
my.chr <- c(1:22, "X", "Y")
ori.1kb.domain.gr <- ori.1kb.domain.gr[seqnames(ori.1kb.domain.gr) %in% my.chr]
seqlevelsStyle(ori.1kb.domain.gr) <- "UCSC"
G4access.ori.gr <- subsetByOverlaps(G4access.hg19.gr, ori.1kb.domain.gr) # 12,809 peaks
# Add peak.id
peak.id <- paste0("G4access_", seq(1,length(G4access.ori.gr),1))
G4access.ori.gr$peak.id <- peak.id
# Recover peak sequences
G4access.ori.seq <- getSeq(Hsapiens, G4access.ori.gr)
names(G4access.ori.seq) <- G4access.ori.gr$peak.id
# Compute max G4H score and recover associated sequence - a rolling window of 25 nt is used
library(data.table)
library(parallel)
# Parallelisable inner function to compute G4H subsequence results
compute_subseq <- function(i, seq.j, name.j) {
subseq.i <- subseq(seq.j, i, i + 24)
G4H.subseq.i <- scoreG4hunt(subseq.i)
# Return a data.table with the relevant information
return(data.table(
peak.id = name.j,
G4.pos = i,
G4.seq = as.character(subseq.i),
G4H = G4H.subseq.i,
abs = G4H.subseq.i^2
))
}
# Main function to loop over sequences and compute max G4H for each
G4H.subseq.G4H.max.df <- data.table()
# Detect number of available cores
num_cores <- detectCores() - 1
# Loop through each sequence
for (j in seq_along(G4access.ori.seq)) {
print(j / length(G4access.ori.seq))
seq.j <- G4access.ori.seq[j]
name.j <- names(G4access.ori.seq)[j]
seq_width <- width(seq.j) - 25
# Use parallel computation to process subsequences
subseq_results <- mclapply(1:seq_width, compute_subseq, seq.j = seq.j, name.j = name.j, mc.cores = num_cores)
# Combine results into a single data.table
subseq_results <- rbindlist(subseq_results)
# Find the max G4H for this sequence and append to the final result
max_result <- subseq_results[which.max(abs), .(peak.id, G4.pos, G4.seq, G4H)]
G4H.subseq.G4H.max.df <- rbind(G4H.subseq.G4H.max.df, max_result)
}
saveRDS(G4H.subseq.G4H.max.df, "./002_Mut_signatures_Pan_cancer/rds/G4access.subseq.G4H.max.df.rds")
# Add G4H score to peak information
G4access.ori.G4H.gr <- as.data.frame(G4access.ori.gr) %>% left_join(G4H.subseq.G4H.max.df, by = "peak.id") %>% mutate(start = start + G4.pos, end = start + 25) %>% dplyr::select(-G4.pos) %>% GRanges()
saveRDS(G4access.ori.G4H.gr, "./002_Mut_signatures_Pan_cancer/rds/G4access.ori.G4H.gr.rds")
# Identify mutated G4 sequences
# Load cluster 2 snvs information
snvs.ori.grl <- readRDS("./002_Mut_signatures_Pan_cancer/rds/snvs.ori.grl.rds")
snvs.flanks.ori.grl <- readRDS("./002_Mut_signatures_Pan_cancer/rds/snvs.flanks.ori.grl.rds")
clusters.df <- readRDS("./002_Mut_signatures_Pan_cancer/rds/clusters.df.rds")
cluster.2.id <- (clusters.df %>% filter(cluster == 2))$donor.id # 42 genomes
snvs.ori.cluster.2.grl <- snvs.ori.grl[names(snvs.ori.grl) %in% cluster.2.id]
snvs.ori.cluster.2.gr <- unlist(snvs.ori.cluster.2.grl) # 3,042 snvs
G4access.ori.mutated.G4.gr <- subsetByOverlaps(G4access.ori.G4H.gr, snvs.ori.cluster.2.gr) # 179 G4s
ks.test(abs(G4access.ori.mutated.G4.gr$G4H), abs(G4access.ori.G4H.gr$G4H))
# p-value = 4.916e-09
# Prepare df
G4access.ori.mutated.G4.df <- as.data.frame(G4access.ori.mutated.G4.gr) %>% mutate(class = "mut.G4s")
G4access.ori.all.G4.df <- as.data.frame(G4access.ori.G4H.gr) %>% mutate(class = "all.G4s")
G4access.ori.G4.summary.df <- rbind(G4access.ori.mutated.G4.df, G4access.ori.all.G4.df)
# Mutated G4 are predicted to be more stable
# Indentify G4 subtypes within mutated and non mutated origin G4
mutated.G4.seq <- as.character(G4access.ori.mutated.G4.gr$G4.seq) # 179
non.mutated.G4.seq <- as.character(G4access.ori.G4H.gr$G4.seq) # 12809
mutated.G4.subtype.df <- G4.classification.func(mutated.G4.seq)
non.mutated.G4.subtype.df <- G4.classification.func(non.mutated.G4.seq)
# Compute stats and bind results
mutated.G4.subtype.summary.df <- mutated.G4.subtype.df %>% dplyr::select(motif, count.mutated = count) %>% mutate(freq.mutated = count.mutated/sum(count.mutated))
non.mutated.G4.subtype.summary.df <- non.mutated.G4.subtype.df %>% dplyr::select(motif, count.non.mutated = count) %>% mutate(freq.non.mutated = count.non.mutated/sum(count.non.mutated))
G4.subtype.summary.df <- mutated.G4.subtype.summary.df %>% left_join(non.mutated.G4.subtype.summary.df, by = "motif") %>% mutate(enr = log2(freq.mutated/freq.non.mutated))
# Mutated G4 are enriched for canonical (no bulges) short loop G4s
# Plot results
# Pies reporting the distribution of G4 subtypes
mutated.G4.subtype.pie <- mutated.G4.subtype.summary.df %>%
mutate(Motifs = fct_relevel(motif, rev(c("loop1-3", "loop4-5", "loop6-7", "long_loops", "simple_bulges", "2tetrads_cbulges", "others")))) %>%
ggplot(aes(x="", y=freq.mutated, fill=Motifs)) +
geom_bar(stat="identity", width=1) +
coord_polar("y", start=0) + ggtitle("Mutated G4s") +
scale_fill_manual(values = c("#D4D2D2", "#33BAED", "#08306B", "#ADCC54","#FFEDA0", "#FD8D3C", "#D6604D")) +
theme_void() + theme(aspect.ratio=1)
mutated.G4.subtype.pie
non.mutated.G4.subtype.pie <- non.mutated.G4.subtype.summary.df %>%
mutate(Motifs = fct_relevel(motif, rev(c("loop1-3", "loop4-5", "loop6-7", "long_loops", "simple_bulges", "2tetrads_cbulges", "others")))) %>%
ggplot(aes(x="", y=freq.non.mutated, fill=Motifs)) +
geom_bar(stat="identity", width=1) +
coord_polar("y", start=0) + ggtitle("All G4s") +
scale_fill_manual(values = c("#D4D2D2", "#33BAED", "#08306B", "#ADCC54","#FFEDA0", "#FD8D3C", "#D6604D")) +
theme_void() + theme(aspect.ratio=1)
non.mutated.G4.subtype.pie
# Plot reporting the enrichment of each subtypes
G4.subtype.summary.plot <- G4.subtype.summary.df %>%
mutate(Motif = fct_relevel(motif, c("loop1-3", "loop4-5", "loop6-7", "long_loops", "simple_bulges", "2tetrads_cbulges", "others"))) %>%
ggplot(aes(x=Motif, y=enr)) +
geom_bar(stat = "identity", fill="#D56114") + ylab("Motif enrichement (log2 FC)") +
geom_hline(yintercept=0, linetype="dashed") +
theme_bw() + theme(aspect.ratio=2, panel.grid.minor = element_line(linetype = "dotted"), legend.position = "none", axis.title.x=element_blank(), axis.text.x=element_text(angle=45,hjust=1))
G4.subtype.summary.plot
# Save all plots
ggarrange(G4access.ori.G4.summary.plot, G4.subtype.summary.plot, mutated.G4.subtype.pie, non.mutated.G4.subtype.pie, nrow = 2, ncol = 2)
pdf("./Manuscript/Nature Communications/Rplots/G4access.plots.pdf", width=6, height=6, useDingbats=FALSE)
ggarrange(G4access.ori.G4.summary.plot, G4.subtype.summary.plot, mutated.G4.subtype.pie, non.mutated.G4.subtype.pie, nrow = 2, ncol = 2)
dev.off()
G-quadruplex SBS
signature
Extract the mutational profile of G-quadruplexes corrected for local
trinucleotide composition.
Compute trinucleotide statistics for all mutated G4s.
# r
library(tibble)
# Add unique G4 id
G4.mut.gr <- subsetByOverlaps(G4.score.ori.gr, snvs.ori.cluster.2.gr)
G4.mut.gr$G4.id <- paste0("G4.", rep(1:length(G4.mut.gr)))
G4.mut.seq <- DNAStringSet(G4.mut.gr$seq) # 2,518 sequences
# Initialise trinucleotide columns
G4.mut.tri.stats.df <- tibble(tri = trinuc.sum.df$tri)
for (i in 1:length(G4.mut.seq)) {
print(i/length(G4.mut.seq))
id.i <- G4.ori.gr$G4.id[i]
seq.i <- G4.mut.seq[i]
orientation.i <- G4.ori.gr$class[i]
if(orientation.i == "G4") {G4.seq.i <- seq.i} else {G4.seq.i <- reverseComplement(seq.i)}
G4.tri <- trinucleotideFrequency(G4.seq.i, step=1, as.prob = FALSE)
G4.tri.df <- t(G4.tri) %>% as.data.frame() %>% rownames_to_column("tri")
colnames(G4.tri.df) <- c("tri", id.i)
G4.mut.tri.stats.df <- G4.mut.tri.stats.df %>% left_join(G4.tri.df, by = "tri")
}
saveRDS(G4.mut.tri.stats.df, file = "./002_Mut_signatures_Pan_cancer/rds/G4.mut.tri.stats.df.rds")
Plot G4 trinucleotide composition.
library(dplyr)
library(ggplot2)
setwd("/Users/pm23/Desktop/Projects/OriCan")
# Load trinucleotide statistics
G4.mut.tri.stats.df <- readRDS(file = "./002_Mut_signatures_Pan_cancer/rds/G4.mut.tri.stats.df.rds")
G4.all.mut.tri.stats.df <- cbind.data.frame(tri = G4.mut.tri.stats.df[,1], tri.all = rowSums(G4.mut.tri.stats.df[,-1]))
# Plot
G4.all.mut.tri.stats.plot <- G4.all.mut.tri.stats.df %>% mutate(rank = dense_rank(dplyr::desc(tri.all))) %>%
mutate(col = case_when(rank >= 10 ~ "1", T ~ "0")) %>%
ggplot(aes(x=rank, y=tri.all)) +
geom_point() + scale_y_log10(limits = c(10,20000)) +
geom_text(aes(label = tri, color = col), angle = 45) +
scale_color_manual(values = c("black", NA), na.value = NA) + ylab("Trinuclotide occurence (log10 scale)") + xlab("Rank") +
theme_bw() + theme(aspect.ratio=1, panel.grid.minor = element_line(linetype = "dotted"), legend.position = "none")
G4.all.mut.tri.stats.plot

pdf("./Rplots/G4.tri.occurence.pdf", width=7, height=6, useDingbats=FALSE)
G4.all.mut.tri.stats.plot
dev.off()
## quartz_off_screen
## 2
Correct observed profile and assess strand bias
# r
library(plyranges)
# Load origin signatures
snvs.ori.mut.cluster <- readRDS("./002_Mut_signatures_Pan_cancer/rds/snvs.ori.mut.cluster.signatures.rds")
# Correct matrix
snvs.G4.cluster.2.corr.mut.mat <- snvs.ori.mut.cluster %>% as.data.frame() %>% dplyr::select(cluster.2) %>% tibble::rownames_to_column(var = "Tri.mut") %>% tidyr::separate(Tri.mut, c("D", "R", "A", "U"), sep = "\\[|>|]", remove = F) %>% mutate(tri = paste0(D, R, U, sep = "")) %>%
dplyr::select(-D, -R, -A, -U, -Tri.mut) %>% left_join(G4.all.mut.tri.stats.df, by = "tri") %>%
mutate(G4 = cluster.2/tri.all, G4.norm = G4/sum(G4)) %>% select(G4 = G4.norm)
# Add G4 mutational signature
snvs.ori.G4.mut.cluster <- cbind.data.frame(snvs.ori.mut.cluster, snvs.G4.cluster.2.corr.mut.mat) %>% as.matrix()
saveRDS(snvs.ori.G4.mut.cluster, "./002_Mut_signatures_Pan_cancer/rds/snvs.ori.mut.cluster.G4.signatures.rds")
Plot
library(dplyr)
library(MutationalPatterns)
setwd("/Users/pm23/Desktop/Projects/OriCan")
# Load signatures
snvs.ori.mut.cluster.G4.signatures.mat <- readRDS("./002_Mut_signatures_Pan_cancer/rds/snvs.ori.mut.cluster.G4.signatures.rds")
# Plot comparative profiles
plot_compare_profiles(snvs.ori.mut.cluster.G4.signatures.mat[,2], snvs.ori.mut.cluster.G4.signatures.mat[,6], profile_names = c("cluster.2", "G4"), condensed = TRUE)

pdf("./Rplots/snvs.ori.cluster.2.vs.G4.pdf", width=5, height=6, useDingbats=FALSE)
plot_compare_profiles(snvs.ori.mut.cluster.G4.signatures.mat[,2], snvs.ori.mut.cluster.G4.signatures.mat[,6], profile_names = c("cluster.2", "G4"), condensed = TRUE)
dev.off()
## quartz_off_screen
## 2
Contribution of
G-quadruplex to cluster 2 SBS
Assess the contribution of G4 to overall mutation rates in cluster
2
# R
################################################################################
# Map G4-forming sequences within 100 nt windows around origins
# Load origin 10kb domains
ori.10kb.df <- read.table("./Dataset/ori/ori.10kb.domain.hg19.NCBI.bed", sep="\t")
colnames(ori.10kb.df) <- c("seqnames", "start", "end", "EFF", "ori.id", "ori.width")
ori.10kb.gr <- makeGRangesFromDataFrame(ori.10kb.df, keep.extra.columns = T)
ori.10kb.gr <- ori.10kb.gr[seqnames(ori.10kb.gr) %in% c(1:22, "X", "Y")]
seqlevelsStyle(ori.10kb.gr) <- "UCSC"
# Recover sequences
ori.10kb.seq <- getSeq(Hsapiens, ori.10kb.gr)
# Map G4 forming sequences in each ori domain
G4.ori.map.df <- tibble()
for (i in 1:length(ori.10kb.gr)) {
print(i/length(ori.10kb.gr))
ori.id.i <- ori.10kb.gr$ori.id[i]
seqnames.i <- seqnames(ori.10kb.gr)[i]
ranges.i <- ranges(ori.10kb.gr)[i]
ori.midpoint.i <- round((start(ranges.i)+end(ranges.i))/2)
seq.i <- as.character(ori.10kb.seq[i])
# Identify G4s
G4.seq.i <- str_match_all(seq.i, "(?=([ACGT]{4}[ACT][G]{3,}[ACGT]{1,12}[G]{3,}[ACGT]{1,12}[G]{3,}[ACGT]{1,12}[G]{3,}[ACT][ACGT]{4}))")[[1]][, 2]
G4.seq.length.i <- nchar(G4.seq.i)
G4.seq.pos.i <- str_locate_all(seq.i, "(?=([ACGT]{4}[ACT][G]{3,}[ACGT]{1,12}[G]{3,}[ACGT]{1,12}[G]{3,}[ACGT]{1,12}[G]{3,}[ACT][ACGT]{4}))")[[1]]
G4.seq.start.i <- G4.seq.pos.i[,1]
G4.seq.end.i <- G4.seq.start.i+G4.seq.length.i
G4.ranges.start.i <- start(ranges.i)+G4.seq.start.i
G4.ranges.end.i <- start(ranges.i)+G4.seq.end.i
if (length(G4.seq.i) == 0) {G4.summary.i = tibble()} else {
G4.summary.i <- cbind.data.frame(seqnames = seqnames.i, start = G4.ranges.start.i, end = G4.ranges.end.i)}
# Identify C4s
C4.seq.i <- str_match_all(seq.i, "(?=([ACGT]{4}[AGT][C]{3,}[ACGT]{1,12}[C]{3,}[ACGT]{1,12}[C]{3,}[ACGT]{1,12}[C]{3,}[AGT][ACGT]{4}))")[[1]][, 2]
C4.seq.length.i <- nchar(C4.seq.i)
C4.seq.pos.i <- str_locate_all(seq.i, "(?=([ACGT]{4}[AGT][C]{3,}[ACGT]{1,12}[C]{3,}[ACGT]{1,12}[C]{3,}[ACGT]{1,12}[C]{3,}[AGT][ACGT]{4}))")[[1]]
C4.seq.start.i <- C4.seq.pos.i[,1]
C4.seq.end.i <- C4.seq.start.i+C4.seq.length.i
C4.ranges.start.i <- start(ranges.i)+C4.seq.start.i
C4.ranges.end.i <- start(ranges.i)+C4.seq.end.i
if (length(C4.seq.i) == 0) {C4.summary.i = tibble()} else {
C4.summary.i <- cbind.data.frame(seqnames = seqnames.i, start = C4.ranges.start.i, end = C4.ranges.end.i)}
# Combine all ranges
all.summary.i <- rbind(G4.summary.i, C4.summary.i)
if (nrow(all.summary.i) == 0) {all.df.i = tibble()} else {
all.gr.i <- reduce(makeGRangesFromDataFrame(all.summary.i))
# Compute midpoints and compute distance to origin
all.df.i <- as.data.frame(all.gr.i) %>% mutate(G4.midpoint = round((start+end)/2), ori.id = ori.id.i, ori.midpoint = ori.midpoint.i, G4.ori.dist = G4.midpoint-ori.midpoint)}
# Append results
G4.ori.map.df <- rbind(G4.ori.map.df, all.df.i)
}
saveRDS(G4.ori.map.df, "./002_Mut_signatures_Pan_cancer/rds/G4.ori.map.df.rds")
G4.ori.map.df <- readRDS("./002_Mut_signatures_Pan_cancer/rds/G4.ori.map.df.rds")
hist(G4.ori.map.df$G4.ori.dist, breaks = 100)
G4.ori.map.gr <- makeGRangesFromDataFrame(G4.ori.map.df, keep.extra.columns = T)
################################################################################
# Compute mutational burden at origins considering G4 forming sequences or not
# Load cluster 2 snvs coordinates
cluster.2.grl <- readRDS(file = "./002_Mut_signatures_Pan_cancer/rds/cluster.2.grl.rds")
cluster.2.gr <- unlist(cluster.2.grl)
# Filter out snvs overlapping with G4 forming sequences
cluster.2.G4.overlap <- findOverlaps(cluster.2.gr, G4.ori.map.gr)
cluster.2.G4.gr <- cluster.2.gr[-queryHits(cluster.2.G4.overlap)]
# Compute snvs distances to origins
ori.df <- read.table("./Dataset/ori/ori.inter.hg19.NCBI.bed")
colnames(ori.df) <- c("seqnames", "start", "end", "EFF", "ori.id", "ori.width")
ori.gr <- makeGRangesFromDataFrame(ori.df, keep.extra.columns = T)
seqlevelsStyle(ori.gr) <- "UCSC"
snvs.cluster.2.ori.nearest <- nearest(cluster.2.gr, ori.gr)
snvs.cluster.2.ori.nearest[is.na(snvs.cluster.2.ori.nearest)] <- 1 # Arbitrarily assign pos 1 to NA values
snvs.pos <- as.data.frame(ranges(cluster.2.gr))$start
ori.pos <- ori.gr[snvs.cluster.2.ori.nearest]
snvs.cluster.2.ori.nearest.df <- cbind.data.frame(snvs.pos, ori.pos) %>% mutate(dist = snvs.pos - start) %>%
filter(dist >= -10000 & dist <= 10000) %>% mutate(dist = (as.numeric(cut(dist, breaks = seq(-10000, 10000, 100)))-100)*100) %>%
group_by(dist) %>% summarise(total.snvs.count = n())
snvs.cluster.2.G4.ori.nearest <- nearest(cluster.2.G4.gr, ori.gr)
snvs.cluster.2.G4.ori.nearest[is.na(snvs.cluster.2.G4.ori.nearest)] <- 1 # Arbitrarily assign pos 1 to NA values
snvs.pos <- as.data.frame(ranges(cluster.2.G4.gr))$start
ori.pos <- ori.gr[snvs.cluster.2.G4.ori.nearest]
snvs.cluster.2.G4.ori.nearest.df <- cbind.data.frame(snvs.pos, ori.pos) %>% mutate(dist = snvs.pos - start) %>%
filter(dist >= -10000 & dist <= 10000) %>% mutate(dist = (as.numeric(cut(dist, breaks = seq(-10000, 10000, 100)))-100)*100) %>%
group_by(dist) %>% summarise(snvs.no.G4.count = n())
snvs.cluster.2.G4.summary <- snvs.cluster.2.ori.nearest.df %>% left_join(snvs.cluster.2.G4.ori.nearest.df)
plot(snvs.cluster.2.ori.nearest.df$dist, snvs.cluster.2.ori.nearest.df$total.snvs.count, col = "red", xlim = c(-5000, 5000))
points(snvs.cluster.2.G4.ori.nearest.df$dist, snvs.cluster.2.G4.ori.nearest.df$snvs.no.G4.count, col = "blue")
# Correct snvs count per base coverage
ori.10kb.domain.100nt.split.gr <- import("./Dataset/ori/ori.10kb.domain.100nt.split.hg19.NCBI.bed")
my.chr <- c(1:22, "X", "Y")
seqlevelsStyle(ori.10kb.domain.100nt.split.gr) <- "UCSC"
base.coverage.df <- as.data.frame(ori.10kb.domain.100nt.split.gr) %>% group_by(name) %>%
summarise(base = sum(width)) %>% mutate(dist = (as.numeric(name)-100)*100)
ori.G4.hits <- findOverlaps(ori.10kb.domain.100nt.split.gr, G4.ori.map.gr)
ori.G4.intersect <- pintersect(ori.10kb.domain.100nt.split.gr[queryHits(hits)],
G4.ori.map.gr[subjectHits(hits)])
G4.coverage.df <- as.data.frame(ori.G4.intersect) %>% group_by(name) %>%
summarise(G4.cov = sum(width)) %>% mutate(dist = (as.numeric(name)-100)*100) %>%
left_join(base.coverage.df) %>% mutate(G4.base = base-G4.cov) %>% select(dist, base, G4.base)
snvs.cluster.2.G4.summary.df <- snvs.cluster.2.G4.summary %>% left_join(G4.coverage.df) %>%
mutate(total.snvs.count.norm = total.snvs.count/base, snvs.no.G4.count.norm = snvs.no.G4.count/G4.base)
saveRDS(snvs.cluster.2.G4.summary.df, "./002_Mut_signatures_Pan_cancer/rds/snvs.cluster.2.G4.summary.df.rds")
plot(snvs.cluster.2.G4.summary$dist, snvs.cluster.2.G4.summary$total.snvs.count.norm, type = "l", col = "red", xlim = c(-5000, 5000))
points(snvs.cluster.2.G4.summary$dist, snvs.cluster.2.G4.summary$snvs.no.G4.count.norm, type = "l", col = "blue")
snvs.cluster.2.G4.summary.df <- readRDS("./002_Mut_signatures_Pan_cancer/rds/snvs.cluster.2.G4.summary.df.rds")
snvs.cluster.2.G4.summary.plot <- snvs.cluster.2.G4.summary %>%
select(dist, all = total.snvs.count.norm, no.G4= snvs.no.G4.count.norm) %>%
mutate(all = all*10e3, no.G4 = no.G4*10e3) %>%
gather(class, value, -dist) %>%
ggplot(aes(x = dist, y = value, color = class)) +
geom_line() + xlim(-5000, 5000) +
scale_color_manual(values = c("#514B76", "#D56114")) +
xlab("Distance from origin (bp)") + ylab("Mutational burden (mutations per kb)") +
theme_bw() + theme(aspect.ratio=1, panel.grid.minor = element_line(linetype = "dotted"))
snvs.cluster.2.G4.summary.plot
################################################################################
# Assess the contribution of G4 mutagenesis per cancer samples
# Load G4 coordinates
G4.ori.map.gr <- makeGRangesFromDataFrame(G4.ori.map.df, keep.extra.columns = T)
seqlevelsStyle(G4.ori.map.gr) <- "NCBI"
# Load manifest
SSM.manifest.df <- read.table(gzfile("./Dataset/ICGC/SSM/manifest.1698683356752.tar.gz"), skip = 1)
SSM.manifest.df <- SSM.manifest.df %>% dplyr::select(V9, V10, V2, V3, V5)
colnames(SSM.manifest.df) <- c("donor.id", "project", "file.id", "object.id", "file.name")
# Select cluster 2 samples
SSM.cluster.2.manifest.df <- SSM.manifest.df %>% filter(donor.id %in% names(cluster.2.grl))
# Open ori and flanks coordinates
ori.1kb.domains.gr <- import("./Dataset/ori/BED/ori.1kb.hg19.bed")
ori.flanks.gr <- import("./Dataset/ori/BED/ori.flanks.hg19.bed")
# List vcf files
vcf.files <- list.files("./Dataset/ICGC/SSM/VCF/") # 3,892 files
vcf.files <- vcf.files[!grepl(".idx", vcf.files)] # 3,892 snvs vcf files
cluster.2.vcf.files <- vcf.files[vcf.files %in% SSM.cluster.2.manifest.df$file.name] # 46 files
# Count snvs
snvs.cluster.2.count.df <- tibble()
for (i in 1:length(cluster.2.vcf.files)) {
vcf.files.i <- cluster.2.vcf.files[i]
print(vcf.files.i)
df.i <- SSM.cluster.2.manifest.df %>% filter(file.name == vcf.files.i)
donor.i <- df.i$donor.id
path.i <- paste("./Dataset/ICGC/SSM/VCF/", vcf.files.i, sep = "")
vcf.i <- read.table(gzfile(path.i), comment.char = "#") %>% dplyr::select(V1, V2, V4, V5)
bed.i <- vcf.i %>% mutate(end = V2 + 1) %>% dplyr::select(seqnames = V1, start = V2, end, REF = V4, ALT = V5)
gr.i <- makeGRangesFromDataFrame(bed.i)
# Total count
ori.overlap.i <- subsetByOverlaps(gr.i, ori.1kb.domains.gr)
snvs.ori.count.i <- length(ori.overlap.i)
flanks.overlap.i <- subsetByOverlaps(gr.i, ori.flanks.gr)
snvs.flanks.count.i <- length(flanks.overlap.i)
# Count filtering G4 overlapping snvs
ori.G4.overlap.i <- findOverlaps(ori.overlap.i, G4.ori.map.gr)
ori.G4.overlap.2.i <- ori.overlap.i[-queryHits(ori.G4.overlap.i)]
snvs.ori.no.G4.count.i <- length(ori.G4.overlap.2.i)
flanks.G4.overlap.i <- findOverlaps(flanks.overlap.i, G4.ori.map.gr)
flanks.G4.overlap.2.i <- flanks.overlap.i[-queryHits(flanks.G4.overlap.i)]
snvs.flanks.no.G4.count.i <- length(flanks.G4.overlap.2.i)
# Combine information
df.summary.i <- cbind.data.frame(donor = donor.i, snvs.ori.count = snvs.ori.count.i, snvs.flanks.count = snvs.flanks.count.i, snvs.ori.no.G4.count = snvs.ori.no.G4.count.i, snvs.flanks.no.G4.count = snvs.flanks.no.G4.count.i)
snvs.cluster.2.count.df <- rbind(snvs.cluster.2.count.df, df.summary.i)
}
saveRDS(snvs.cluster.2.count.df, "./002_Mut_signatures_Pan_cancer/rds/snvs.cluster.2.count.df.rds")
# Compute mutational burden
ori.df <- snvs.cluster.2.G4.summary.2 %>% filter(dist >= 1000 & dist <= 1000)
flanks.df <- snvs.cluster.2.G4.summary.2 %>% filter(dist < 1000 | dist > 1000)
ori.base <- sum(ori.df$base)
ori.no.G4.base <- sum(ori.df$G4.base)
flanks.base <- sum(flanks.df$base)
flanks.no.G4.base <- sum(flanks.df$G4.base)
snvs.cluster.2.mut.burden.df <- snvs.cluster.2.count.df %>%
mutate(snvs.ori.burden = snvs.ori.count/ori.base,
snvs.ori.no.G4.burden = snvs.ori.no.G4.count/ori.no.G4.base,
snvs.flanks.burden = snvs.flanks.count/flanks.base,
snvs.flanks.no.G4.burden = snvs.flanks.no.G4.count/flanks.no.G4.base)
saveRDS(snvs.cluster.2.mut.burden.df, "./002_Mut_signatures_Pan_cancer/rds/snvs.cluster.2.mut.burden.df.rds")
# Compute contribution of G4 forming sequences to origin mutational burden
snvs.cluster.2.mut.burden.df <- readRDS("./002_Mut_signatures_Pan_cancer/rds/snvs.cluster.2.mut.burden.df.rds")
snvs.cluster.2.G4.contribution.df <- snvs.cluster.2.mut.burden.df %>%
mutate(ori.ratio = snvs.ori.no.G4.burden/snvs.ori.burden, flanks.ratio = snvs.flanks.no.G4.burden/snvs.flanks.burden) %>%
mutate(ori = (1-ori.ratio)*100, flanks = (1-flanks.ratio)*100)
snvs.cluster.2.G4.contribution.plot <- snvs.cluster.2.G4.contribution.df %>% select(donor, ori, flanks) %>%
gather(class, value, -donor) %>%
ggplot(aes(x=class, y=value, alpha = 0.3)) +
geom_jitter(color="black", size=0.5, alpha=0.4) + ylim(-2, 100) +
geom_boxplot(width = 0.5, fill="#D56114") + ylab("Contribution of G4 sequences\nto mutational burden (%)") +
theme_bw() + theme(aspect.ratio=2, panel.grid.minor = element_line(linetype = "dotted"), legend.position = "none", axis.title.x=element_blank(), axis.text.x=element_text(angle=45,hjust=1))
snvs.cluster.2.G4.contribution.plot
summary(snvs.cluster.2.G4.contribution.df$ori)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 4.83 19.10 31.65 33.02 46.52 72.52
# G4s contribute in average to ~ 33 % and up to 72 % to mutagenesis at origins
# Prepare and save final plot
pdf("./Manuscript/Nature Communications/Rplots/G4.mut.burden.pdf", width=12, height=4, useDingbats=FALSE)
ggarrange(snvs.cluster.2.G4.summary.plot, snvs.cluster.2.G4.contribution.plot, nrow = 1)
dev.off()
Cluster 2 SBS
signature strand bias
Assess potential strand biases associated with cluster 2 SBS
signature.
# r
# Load cluster 2 snvs gr list
cluster.2.snvs.grl <- readRDS("./002_Mut_signatures_Pan_cancer/sig/cluster.2.snvs.grl.rds")
# Split pyrimidine and purine mutations
cluster.2.pyr.grl <- cluster.2.snvs.grl
cluster.2.pur.grl <- cluster.2.snvs.grl
for (i in 1:length(cluster.2.snvs.grl)) {
print(i/length(cluster.2.snvs.grl))
gr.i <- cluster.2.snvs.grl[i]
bin.i <- names(gr.i)
unlist.gr.i <- unlist(gr.i)
pyr.gr.i <- unlist.gr.i[as.character(unlist.gr.i$REF) == "C" | as.character(unlist.gr.i$REF) == "T"]
pyr.gr.comp.i <- GRangesList(pyr.gr.i, compress=TRUE)
cluster.2.pyr.grl[i] <- pyr.gr.comp.i
pur.gr.i <- unlist.gr.i[as.character(unlist.gr.i$REF) == "G" | as.character(unlist.gr.i$REF) == "A"]
pur.gr.comp.i <- GRangesList(pur.gr.i, compress=TRUE)
cluster.2.pur.grl[i] <- pur.gr.comp.i
}
saveRDS(cluster.2.pyr.grl, file = "./002_Mut_signatures_Pan_cancer/sig/cluster.2.pyr.grl.rds")
saveRDS(cluster.2.pur.grl, file = "./002_Mut_signatures_Pan_cancer/sig/cluster.2.pur.grl.rds")
# Compute mutational matrices
cluster.2.pyr.mat <- mut_matrix(vcf_list = cluster.2.pyr.grl, ref_genome = ref.genome)
cluster.2.pur.mat <- mut_matrix(vcf_list = cluster.2.pur.grl, ref_genome = ref.genome)
# Fit signatures
cluster.2.pyr.fit <- fit_to_signatures(cluster.2.pyr.mat, snvs.ori.G4.mut.cluster)
cluster.2.pur.fit <- fit_to_signatures(cluster.2.pur.mat, snvs.ori.G4.mut.cluster)
# Format
cluster.2.pyr.fit.df <- as.data.frame(cluster.2.pyr.fit$contribution)
cluster.2.pyr.fit.df <- as.data.frame(t(cluster.2.pyr.fit.df)) %>% rownames_to_column(var = "dist") %>% select(dist, value = cluster.2) %>% mutate(strand = "plus")
cluster.2.pur.fit.df <- as.data.frame(cluster.2.pur.fit$contribution)
cluster.2.pur.fit.df <- as.data.frame(t(cluster.2.pur.fit.df)) %>% rownames_to_column(var = "dist") %>% select(dist, value = cluster.2) %>% mutate(strand = "minus")
cluster.2.strand.fit.df <- rbind(cluster.2.pyr.fit.df, cluster.2.pur.fit.df)
cluster.2.strand.fit.df$dist <- as.numeric(cluster.2.strand.fit.df$dist)
saveRDS(cluster.2.strand.fit.df, file = "./002_Mut_signatures_Pan_cancer/rds/cluster.2.strand.fit.df.rds")
Plot
library(dplyr)
library(ggplot2)
setwd("/Users/pm23/Desktop/Projects/OriCan")
cluster.2.strand.fit.df <- readRDS("./002_Mut_signatures_Pan_cancer/rds/cluster.2.strand.fit.df.rds")
cluster.2.strand.fit.plot <- cluster.2.strand.fit.df %>% ggplot(aes(x = dist, y = value, color = strand)) +
geom_line() + xlim(-5000, 5000) +
scale_color_manual(values = c("#514B76", "#D56114")) +
xlab("Distance from origin (bp)") + ylab("Absolute SBS\ncluster 1 contribution") +
theme_bw() + theme(aspect.ratio=1, panel.grid.minor = element_line(linetype = "dotted"))
cluster.2.strand.fit.plot

pdf("./Rplots/cluster.2.strand.fit.plot.pdf", width=7, height=6, useDingbats=FALSE)
cluster.2.strand.fit.plot
dev.off()
## quartz_off_screen
## 2
This signature do not show any strand bias.
Patterns of mutation
at G-quadruplexes
Assess the distribution of snvs within G4s when oriented toward the
direction of replication.
# r
# Compute distances of identified G4s to origins and assign a replication strand (leading or lagging).
G4.score.ori.df <- readRDS(file = "./002_Mut_signatures_Pan_cancer/rds/G4.score.ori.df.rds")
G4.score.ori.gr <- makeGRangesFromDataFrame(G4.score.ori.df, keep.extra.columns = T)
ori.inter.dist.df <- read.table("./Dataset/ori/ori.inter.hg19.NCBI.bed", sep="\t")
colnames(ori.inter.dist.df) <- c("seqnames", "start", "end", "EFF", "ori.id", "ori.width")
ori.inter.dist.gr <- makeGRangesFromDataFrame(ori.inter.dist.df, keep.extra.columns = T)
seqlevelsStyle(ori.inter.dist.gr) <- "UCSC"
G4.ori.dist <- nearest(G4.score.ori.gr, ori.inter.dist.gr)
ori.dist.gr <- ori.inter.dist.gr[G4.ori.dist]
G4.ori.dist.gr <- as.data.frame(G4.score.ori.gr) %>% mutate(ori.pos = start(ori.dist.gr), ori.dist = start-ori.pos) %>%
mutate(ori.pos = case_when(ori.dist <= 0 ~ "upstream", T ~ "downtream")) %>%
mutate(rep.strand = case_when((ori.pos == "upstream" & class == "G4") ~ "leading",
(ori.pos == "downtream" & class == "C4") ~ "leading",
T ~ "lagging")) %>%
makeGRangesFromDataFrame(keep.extra.columns = T)
saveRDS(G4.ori.dist.gr, file = "./002_Mut_signatures_Pan_cancer/rds/G4.ori.dist.gr.rds")
# Compute oriented distances of mapped snvs
# G4 motif start at 6nt of the reported position due to the specific regex we used, so we add/remove 7 to the mapped position to have the start of the G4 motif at position 0
snvs.G4.gr <- snvs.ori.cluster.2.gr
snvs.G4.dist <- nearest(snvs.G4.gr, G4.ori.dist.gr)
G4.start <- start(G4.ori.dist.gr[snvs.G4.dist])+7
G4.end <- end(G4.ori.dist.gr[snvs.G4.dist])-7
G4.ori.dist <- G4.ori.dist.gr[snvs.G4.dist]$ori.dist
G4.class <- G4.ori.dist.gr[snvs.G4.dist]$class
snvs.G4.dist.df <- cbind.data.frame(as.data.frame(snvs.G4.gr), G4.start, G4.end, G4.ori.dist, G4.class) %>%
mutate(G4.snv.dist = case_when(G4.ori.dist < 0 ~ G4.end-start,
G4.ori.dist > 0 ~ G4.start-start))
saveRDS(snvs.G4.dist.df, file = "./002_Mut_signatures_Pan_cancer/rds/snvs.G4.dist.df.rds")
# Extract G4 domain and compute G frequency for G4 visualisation
G4.up.df <- snvs.G4.dist.df %>% filter(G4.ori.dist > 0 & G4.class == "G4") %>% dplyr::select(seqnames, start = G4.start) %>%
mutate(start = start-52, end = start+150) # rephase and expand domains
G4.up.gr <- makeGRangesFromDataFrame(G4.up.df)
G4.up.views <- Views(Hsapiens, G4.up.gr)
G4.up.consMat <- consensusMatrix(G4.up.views, as.prob=T, shift=0L, baseOnly=TRUE, width=150)
G4.up.consMat.df <- cbind.data.frame(dist = seq(-49,100,1), G.freq.up = G4.up.consMat[3,])
G4.down.df <- snvs.G4.dist.df %>% filter(G4.ori.dist < 0 & G4.class == "G4") %>% dplyr::select(seqnames, start = G4.end) %>%
mutate(end = start+51, start = end-150, ) # rephase and expand domains
G4.down.gr <- makeGRangesFromDataFrame(G4.down.df)
G4.down.views <- Views(Hsapiens, G4.down.gr)
G4.down.consMat <- consensusMatrix(G4.down.views, as.prob=T, shift=0L, baseOnly=TRUE, width=150)
G4.down.consMat.df <- cbind.data.frame(dist = rev(seq(-50,99,1)), G.freq.down = G4.down.consMat[3,])
# Combine information
G4.consMat.df <- G4.up.consMat.df %>% left_join(G4.down.consMat.df, by = "dist") %>% mutate(G.freq = (G.freq.up+G.freq.down)/2)
saveRDS(G4.consMat.df, file = "./002_Mut_signatures_Pan_cancer/rds/G4.consMat.df.rds")
Plot
library(dplyr)
library(ggplot2)
setwd("/Users/pm23/Desktop/Projects/OriCan")
# Load data
snvs.G4.dist.df <- readRDS(file = "./002_Mut_signatures_Pan_cancer/rds/snvs.G4.dist.df.rds")
G4.consMat.df <- readRDS(file = "./002_Mut_signatures_Pan_cancer/rds/G4.consMat.df.rds")
# Plot mutation patterns at G4s
snvs.G4.dist.plot <- snvs.G4.dist.df %>% filter(G4.snv.dist >= -50 & G4.snv.dist <= 100) %>% ggplot(aes(x=G4.snv.dist)) +
geom_histogram(binwidth=1, fill="#D56014", color="#010101", linewidth = 0.25) + xlab("Position from G4 (nt)") + ylab("snvs count") + xlim(-50,50) +
theme_bw() + theme(aspect.ratio=0.5, panel.grid.minor = element_line(linetype = "dotted"))
snvs.G4.dist.plot

# Plot G statistics at phases G4s
G4.consMat.plot <- G4.consMat.df %>% ggplot(aes(x=dist, y=G.freq)) +
geom_line(color = "#4F4A75") + xlab("Position from G4 (nt)") + ylab("G frequency") + xlim(-50,50) +
theme_bw() + theme(aspect.ratio=0.5, panel.grid.minor = element_line(linetype = "dotted"))
G4.consMat.plot

# Save plots
pdf("./Rplots/G4.mut.density.pdf", width=8, height=6, useDingbats=FALSE)
ggarrange(snvs.G4.dist.plot, G4.consMat.plot, nrow = 2)
dev.off()
## quartz_off_screen
## 2
Polymerase usage at
origins
To absence of strand bias for mutagenesis at G4s in the vicinity of
origins suggests that these sites are being replicated by the same
polymerase.
Fit COSMIC signatures associated with deficient pol epsilon and delta
activity. From Robinson et al. Nature Genetics 2021, 53, 1434–1442 (https://www.nature.com/articles/s41588-021-00930-y).
Download individual signatures SBS10a,b (POLE) and SBS10c,d(POLD)
from the COSMIC signature websites: https://cancer.sanger.ac.uk/signatures/sbs/ and fit
their contribution to constitutive origins.
# r
SBS10a <- read.csv("./Dataset/COSMIC/SBS/v3.2_SBS10a_PROFILE.txt", sep = "\t") %>% dplyr::select(tri = X, SBS10a = SBS10a_GRCh37)
SBS10b <- read.csv("./Dataset/COSMIC/SBS/v3.2_SBS10b_PROFILE.txt", sep = "\t") %>% dplyr::select(tri = X, SBS10b = SBS10b_GRCh37)
SBS10c <- read.csv("./Dataset/COSMIC/SBS/v3.2_SBS10c_PROFILE_cxbZBIy.txt", sep = "\t") %>% dplyr::select(tri = X, SBS10c = SBS10c_GRCh37)
SBS10d <- read.csv("./Dataset/COSMIC/SBS/v3.2_SBS10d_PROFILE_IJAo3WV.txt", sep = "\t") %>% dplyr::select(tri = X, SBS10d = SBS10d_GRCh37)
SBS.pol <- SBS10a %>% full_join(SBS10b) %>% full_join(SBS10c) %>% full_join(SBS10d) %>% dplyr::select(-tri) %>% as.matrix()
cluster.2.Pol.fit <- fit_to_signatures_strict(cluster.2.mut.mat, SBS.pol, max_delta = 0.004)
cluster.2.Pol.fit.df <- as.data.frame(cluster.2.Pol.fit$fit_res$contribution)
cluster.2.Pol.fit.df <- as.data.frame(t(cluster.2.Pol.fit.df))
cluster.2.Pol.fit.filter.df <- cluster.2.Pol.fit.df %>% dplyr::select_if(~any(. >= 10)) %>% rownames_to_column(var = "dist") %>% gather("sig", "value", -dist)
saveRDS(cluster.2.Pol.fit.filter.df, "./002_Mut_signatures_Pan_cancer/rds/cluster.2.Pol.fit.filter.df.rds")
Plot
library(dplyr)
library(ggplot2)
setwd("/Users/pm23/Desktop/Projects/OriCan")
cluster.2.Pol.fit.filter.df <- readRDS(file = "./002_Mut_signatures_Pan_cancer/rds/cluster.2.Pol.fit.filter.df.rds")
cluster.2.Pol.epsilon.fit.plot <- cluster.2.Pol.fit.filter.df %>% filter(sig == "SBS10b") %>%
ggplot(aes(x=as.numeric(dist), y=value, colour=sig)) + geom_line() +
scale_color_manual(values = c("#0775B6")) + ylim(-10,40) +
xlim(-5000,5000) +
xlab("Distance from origin (bp)") + ylab("Polymerase signature exposure\n(absolute contribution)") +
theme_bw() + theme(aspect.ratio=1.0, panel.grid.minor = element_line(linetype = "dotted"))
cluster.2.Pol.epsilon.fit.plot

cluster.2.Pol.delta.fit.plot <- cluster.2.Pol.fit.filter.df %>% filter(sig == "SBS10c") %>%
ggplot(aes(x=as.numeric(dist), y=value, colour=sig)) + geom_line() +
scale_color_manual(values = c("#D56114")) + ylim(20,70) +
xlim(-5000,5000) +
xlab("Distance from origin (bp)") + ylab("Polymerase signature exposure\n(absolute contribution)") +
theme_bw() + theme(aspect.ratio=1.0, panel.grid.minor = element_line(linetype = "dotted"))
cluster.2.Pol.delta.fit.plot

pdf("./Rplots/cluster.2.Pol.fit.plot.pdf", width=10, height=6, useDingbats=FALSE)
ggarrange(cluster.2.Pol.epsilon.fit.plot, cluster.2.Pol.delta.fit.plot, nrow = 1, ncol = 2)
dev.off()
## quartz_off_screen
## 2
AID hyperactivity at
origins in malignant lymphomas
Cluster 4 of mutational signatures suggest differential AID activity
at replication origins compared to background.
In this section, we aim to characterise AID activity at origins and
correlation with local variation in mutation burden.
Mutations at
canonical AID motifs
AID activity is assessed by tracking C to G/T mutations within AID
specific WRCY motifs, where W = A or T, R = purine and Y = pyrimidine
(see Hernández-Verdin et al. npj Precision Oncology, 2022, 6, 89; Kasar
et al. Nature commun. 2015, 6, 8866 for examples).
Compute mutational profiles at origins and origin flanks in larger
contexts from genomes of cluster 4.
# r
# Load gr lists
snvs.ori.grl <- readRDS("./002_Mut_signatures_Pan_cancer/rds/snvs.ori.grl.rds")
snvs.flanks.ori.grl <- readRDS("./002_Mut_signatures_Pan_cancer/rds/snvs.flanks.ori.grl.rds")
# Select cluster 4 genomes
clusters.df <- readRDS("./002_Mut_signatures_Pan_cancer/rds/clusters.df.rds")
cluster.4.id <- (clusters.df %>% filter(cluster == 4))$donor.id # 23 genomes
snvs.ori.cluster.4.grl <- snvs.ori.grl[names(snvs.ori.grl) %in% cluster.4.id]
snvs.flanks.cluster.4.grl <- snvs.flanks.ori.grl[names(snvs.flanks.ori.grl) %in% cluster.4.id]
# Extract contexts
# Load genome
ref.genome <- BSgenome.Hsapiens.UCSC.hg19
snvs.ori.cluster.4.mut.mat.ext.context <- mut_matrix(unlist(snvs.ori.cluster.4.grl), ref_genome = ref.genome, extension = 2)
colnames(snvs.ori.cluster.4.mut.mat.ext.context) <- c("ori")
snvs.flanks.cluster.4.mut.mat.ext.context <- mut_matrix(unlist(snvs.flanks.cluster.4.grl), ref_genome = ref.genome, extension = 2)
colnames(snvs.flanks.cluster.4.mut.mat.ext.context) <- c("flanks")
snvs.cluster.4.mut.mat.ext.context <- cbind(snvs.ori.cluster.4.mut.mat.ext.context, snvs.flanks.cluster.4.mut.mat.ext.context)
saveRDS(snvs.cluster.4.mut.mat.ext.context, file = "./002_Mut_signatures_Pan_cancer/sig/snvs.cluster.4.mut.mat.ext.context.rds")
Plot mutation extended profiles
library(dplyr)
library(ggplot2)
library(MutationalPatterns)
library(ggpubr)
setwd("/Users/pm23/Desktop/Projects/OriCan")
# Load mutation matrix
snvs.cluster.4.mut.mat.ext.context <- readRDS(file = "./002_Mut_signatures_Pan_cancer/sig/snvs.cluster.4.mut.mat.ext.context.rds")
ext.profile.plot <- plot_profile_heatmap(snvs.cluster.4.mut.mat.ext.context, by = c("ori", "flanks")) + theme(aspect.ratio=0.7)
river.plot <- plot_river(snvs.cluster.4.mut.mat.ext.context) + theme(aspect.ratio=0.4)
ext.profile.plot

river.plot

pdf("./Rplots/cluster.4.extended.profile.pdf", width=15, height=12, useDingbats=FALSE)
ggarrange(ext.profile.plot, river.plot, nrow = 2)
dev.off()
## quartz_off_screen
## 2
Extract motif sequences for DNA logos
# r
# Prepare df
snvs.cluster.logo.df <- as.data.frame(snvs.cluster.4.mut.mat.ext.context)
context <- data.frame(do.call(rbind, strsplit(as.character(row.names(snvs.cluster.logo.df)), ""))) %>% dplyr::select(-X3, -X5, -X7)
colnames(context) <- c("u2", "u1", "REF", "ALT", "d1", "d2")
snvs.cluster.logo.df <- cbind.data.frame(snvs.cluster.logo.df, context)
# Prepare matrices for logos
# At origins
ori.logo.mat.u2 <- as.data.frame(table(rep(snvs.cluster.logo.df$u2, snvs.cluster.logo.df$ori))) %>% rename(seq = Var1, u2 = Freq)
ori.logo.mat.u1 <- as.data.frame(table(rep(snvs.cluster.logo.df$u1, snvs.cluster.logo.df$ori))) %>% rename(seq = Var1, u1 = Freq)
ori.logo.mat.REF <- as.data.frame(table(rep(snvs.cluster.logo.df$REF, snvs.cluster.logo.df$ori))) %>% rename(seq = Var1, REF = Freq)
ori.logo.mat.d1 <- as.data.frame(table(rep(snvs.cluster.logo.df$d1, snvs.cluster.logo.df$ori))) %>% rename(seq = Var1, d1 = Freq)
ori.logo.mat.d2 <- as.data.frame(table(rep(snvs.cluster.logo.df$d2, snvs.cluster.logo.df$ori))) %>% rename(seq = Var1, d2 = Freq)
ori.logo.mat <- ori.logo.mat.u2 %>% left_join(ori.logo.mat.u1) %>% left_join(ori.logo.mat.REF) %>% left_join(ori.logo.mat.d1) %>% left_join(ori.logo.mat.d2) %>% dplyr::select(-seq)
row.names(ori.logo.mat) <- c("A", "C", "G", "T")
saveRDS(ori.logo.mat, file = "./002_Mut_signatures_Pan_cancer/sig/ori.logo.mat.rds")
# At origin flanks
flanks.logo.mat.u2 <- as.data.frame(table(rep(snvs.cluster.logo.df$u2, snvs.cluster.logo.df$flanks))) %>% rename(seq = Var1, u2 = Freq)
flanks.logo.mat.u1 <- as.data.frame(table(rep(snvs.cluster.logo.df$u1, snvs.cluster.logo.df$flanks))) %>% rename(seq = Var1, u1 = Freq)
flanks.logo.mat.REF <- as.data.frame(table(rep(snvs.cluster.logo.df$REF, snvs.cluster.logo.df$flanks))) %>% rename(seq = Var1, REF = Freq)
flanks.logo.mat.d1 <- as.data.frame(table(rep(snvs.cluster.logo.df$d1, snvs.cluster.logo.df$flanks))) %>% rename(seq = Var1, d1 = Freq)
flanks.logo.mat.d2 <- as.data.frame(table(rep(snvs.cluster.logo.df$d2, snvs.cluster.logo.df$flanks))) %>% rename(seq = Var1, d2 = Freq)
flanks.logo.mat <- flanks.logo.mat.u2 %>% left_join(flanks.logo.mat.u1) %>% left_join(flanks.logo.mat.REF) %>% left_join(flanks.logo.mat.d1) %>% left_join(flanks.logo.mat.d2) %>% dplyr::select(-seq)
row.names(flanks.logo.mat) <- c("A", "C", "G", "T")
saveRDS(flanks.logo.mat, file = "./002_Mut_signatures_Pan_cancer/sig/flanks.logo.mat.rds")
Plot DNA logos
library(dplyr)
library(ggplot2)
library(ggseqlogo)
library(ggpubr)
setwd("/Users/pm23/Desktop/Projects/OriCan")
# Load data
ori.logo.mat <- readRDS(file = "./002_Mut_signatures_Pan_cancer/sig/ori.logo.mat.rds")
flanks.logo.mat <- readRDS(file = "./002_Mut_signatures_Pan_cancer/sig/flanks.logo.mat.rds")
logo.col <- make_col_scheme(chars=c('A', 'T', 'C', 'G'),cols=c('#ADCC55', '#DF1F18', '#33BAED', '#E6A004'))
logo.ori <- ggplot() + geom_logo(as.matrix(ori.logo.mat), col_scheme=logo.col, method = "probability") + theme_logo() + theme(aspect.ratio=0.4) + ggtitle("origins")
logo.flanks <- ggplot() + geom_logo(as.matrix(flanks.logo.mat), col_scheme=logo.col, method = "probability") + theme_logo() + theme(aspect.ratio=0.4) + ggtitle("flanks")
pdf("./Rplots/cluster.4.DNA.logo.pdf", width=5, height=5, useDingbats=FALSE)
ggarrange(logo.ori, logo.flanks, nrow = 2)
dev.off()
## quartz_off_screen
## 2
Compute the ratio of mutations occurring at AID canonical WRCY motifs
over total mutations in bins of 100bp around origins
# r
# Load grange lists
cluster.4.snvs.bin.grl <- readRDS("./002_Mut_signatures_Pan_cancer/sig/cluster.4.snvs.grl.rds")
# Extract contexts
snvs.ori.cluster.4.mut.mat.bin.ext.context <- mut_matrix(cluster.4.snvs.bin.grl, ref_genome = ref.genome, extension = 2)
# Define AID canonical WR[C>T]YN and WR[C>G]YN motifs (W = A or T, R = purine and Y = pyrimidine)
AID.can.motifs <- expand.grid(c("A", "T"), c("A", "G"), c("[C>T]", "[C>G]"), c("C", "T"), c("A", "C", "G", "T")) %>% mutate(AID.context = paste0(Var1, Var2, Var3, Var4, Var5))
length(AID.can.motifs$AID.context) # 64 motifs
# Compute ratio
cluster.4.AID.ratio.df <- tibble()
for (i in 1:ncol(snvs.ori.cluster.4.mut.mat.bin.ext.context)) {
print(i/ncol(snvs.ori.cluster.4.mut.mat.bin.ext.context))
dist.i <- as.numeric(colnames(snvs.ori.cluster.4.mut.mat.bin.ext.context)[i])
mut.df.i <- cbind.data.frame(context = rownames(snvs.ori.cluster.4.mut.mat.bin.ext.context), value = as.vector(snvs.ori.cluster.4.mut.mat.bin.ext.context[,i]))
count.AID.cont.i <- mut.df.i %>% filter(context %in% AID.can.motifs$AID.context)
count.non.AID.cont.i <- mut.df.i %>% filter(!context %in% AID.can.motifs$AID.context)
ratio.AID.i <- sum(count.AID.cont.i$value) / (sum(count.AID.cont.i$value) + sum(count.non.AID.cont.i$value))
ratio.df.i <- cbind.data.frame(dist = dist.i, ratio.AID = ratio.AID.i)
cluster.4.AID.ratio.df <- rbind(cluster.4.AID.ratio.df, ratio.df.i)
}
saveRDS(cluster.4.AID.ratio.df, file = "./002_Mut_signatures_Pan_cancer/sig/cluster.4.AID.ratio.df.rds")
# As a control, compute the ratio of Cs in a AID canonical motifs compared to the total number of C in 100bp bins around origins
# Load domain/bin coordinates
ori.10kb.domain.100nt.split.gr <- import("./Dataset/ori/ori.10kb.domain.100nt.split.hg19.NCBI.bed")
my.chr <- c(1:22, "X", "Y")
ori.10kb.domain.100nt.split.gr <- ori.10kb.domain.100nt.split.gr[seqnames(ori.10kb.domain.100nt.split.gr) %in% my.chr]
seqlevelsStyle(ori.10kb.domain.100nt.split.gr) <- "UCSC"
# Recover sequences and compute base composition statistics
ori.views <- Views(Hsapiens, ori.10kb.domain.100nt.split.gr)
ori.seq <- DNAStringSet(ori.views)
stat.AID.seq.df <- tibble()
for (i in 1:length(ori.seq)) {
print(i/length(ori.seq))
seq.i <- as.character(ori.seq[i])
C.count.i <- str_count(seq.i, pattern = "C")
G.count.i <- str_count(seq.i, pattern = "G")
AID.mot.plus.count.i <- str_count(seq.i, pattern = "[A|T][A|G]C[C|T]")
AID.mot.minus.count.i <- str_count(seq.i, pattern = "[G|A]G[T|C][T|A]")
df.i <- cbind.data.frame(C.count = C.count.i, G.count = G.count.i, AID.mot.plus.count = AID.mot.plus.count.i, AID.mot.minus.count = AID.mot.minus.count.i)
stat.AID.seq.df <- rbind(stat.AID.seq.df, df.i)
}
stat.AID.seq.df$bin <- ori.10kb.domain.100nt.split.gr$name
saveRDS(stat.AID.seq.df, "./002_Mut_signatures_Pan_cancer/rds/stat.AID.seq.df.rds")
stat.AID.seq.summary.df <- stat.AID.seq.df %>% group_by(bin) %>% summarise(C.count = sum(C.count), G.count = sum(G.count), AID.mot.plus.count = sum(AID.mot.plus.count), AID.mot.minus.count = sum(AID.mot.minus.count))
saveRDS(stat.AID.seq.summary.df, "./002_Mut_signatures_Pan_cancer/rds/stat.AID.seq.summary.df.rds")
Plot results
library(dplyr)
library(ggplot2)
setwd("/Users/pm23/Desktop/Projects/OriCan")
# Load results
cluster.4.AID.ratio.df <- readRDS(file = "./002_Mut_signatures_Pan_cancer/sig/cluster.4.AID.ratio.df.rds")
stat.AID.seq.summary.df <- readRDS(file = "./002_Mut_signatures_Pan_cancer/rds/stat.AID.seq.summary.df.rds")
# Format and combine information
cluster.4.AID.ratio.df <- cluster.4.AID.ratio.df %>% dplyr::rename(value = ratio.AID) %>% mutate(data = "snvs")
stat.AID.seq.summary.2.df <- stat.AID.seq.summary.df %>% mutate(dist = (as.numeric(bin)-100)*100, value = (AID.mot.plus.count+AID.mot.minus.count)/(C.count+G.count)) %>% dplyr::select(dist, value) %>% mutate(data = "motif")
cluster.4.AID.ratio.summary.df <- rbind(cluster.4.AID.ratio.df, stat.AID.seq.summary.2.df)
cluster.4.AID.ratio.plot <- cluster.4.AID.ratio.summary.df %>% ggplot(aes(x = dist, y = value, color = data)) +
geom_line() + xlim(-5000, 5000) +
scale_color_manual(values = c("black", "#E7A106")) + ylim(0,0.25) +
xlab("Distance from origin (bp)") + ylab("Fraction at cAID motif") +
theme_bw() + theme(aspect.ratio=1, panel.grid.minor = element_line(linetype = "dotted"))
cluster.4.AID.ratio.plot

pdf("./Rplots/cluster.4.AID.ratio.plot.pdf", width=7, height=6, useDingbats=FALSE)
cluster.4.AID.ratio.plot
dev.off()
## quartz_off_screen
## 2
AID gene
expression
Assess correlation between AID expression and mutagenesis at
origins.
Gene expression data (in TPM) is formatted as described in
/cephfs2/pmurat/OriCan/003_Transcriptomics_cluster_analysis_v3_1.Rmd
This is a pan cancer analysis as there is no transcriptomic data for
CMDI.
# r
# Load transcriptomics data
ICGC.transc.TPM.df <- readRDS(file = "./003_Transcriptomics_cluster_analysis/rds/ICGC.transc.TPM.df.rds")
# Filter AID gene
AID.TPM.df <- ICGC.transc.TPM.df %>% filter(gene.id == "AICDA")
# Compute mean snvs density ratios in bin of AID expression
AID.TPM.summary.df <- AID.TPM.df %>% ungroup() %>% filter(TPM > 0) %>% mutate(bin = ntile(TPM, 10)) %>% group_by(bin) %>%
summarise(AID.TPM = mean(TPM, na.rm = T), snvs.dens.ratio.mean = mean(snvs.dens.ratio, na.rm = T), snvs.dens.ratio.sem = std.error(snvs.dens.ratio, na.rm = T))
saveRDS(AID.TPM.summary.df, "./002_Mut_signatures_Pan_cancer/rds/AID.TPM.summary.df.rds")
# Assess correlation
cor.test(AID.TPM.df$TPM, AID.TPM.df$snvs.dens.ratio, na.rm = T) # Rho = 0.2551857, p-value = 6.973e-12
Plot
library(dplyr)
library(ggplot2)
setwd("/Users/pm23/Desktop/Projects/OriCan")
AID.TPM.summary.df <- readRDS("./002_Mut_signatures_Pan_cancer/rds/AID.TPM.summary.df.rds")
AID.TPM.summary.plot <- AID.TPM.summary.df %>%
ggplot(aes(x=AID.TPM, y=snvs.dens.ratio.mean)) +
geom_line(color = "#E7A106") +
geom_errorbar(aes(ymin=snvs.dens.ratio.mean-snvs.dens.ratio.sem, ymax=snvs.dens.ratio.mean+snvs.dens.ratio.sem), width=.2, position=position_dodge(0.05), color = "#E7A106") +
geom_point(shape = 21, size = 2, fill = "white", color = "#E7A106") +
scale_x_continuous(trans = log10_trans(), breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x))) +
ylab("Mutation density ratio (ori/flanks)") + xlab("AICDA expression bin (log10 TPM)") + ggtitle("Rho = 0.255, p-value = 6.973e-12") +
theme_bw() + theme(aspect.ratio=2, panel.grid.minor = element_line(linetype = "dotted"))
AID.TPM.summary.plot

pdf("./Rplots/AID.TPM.summary.plot.pdf", width=7, height=6, useDingbats=FALSE)
AID.TPM.summary.plot
dev.off()
## quartz_off_screen
## 2
APOBEC contribution
to origin mutagenesis
Assess the contribution of other cytidine deaminases to origin
mutagenesis.
We focus herein on the family of APOBEC3 and compare snvs density
ratio for MALY donors binned by APOBEC level of expression.
# r
# Load transcriptomics data
ICGC.transc.TPM.df <- readRDS("./003_Transcriptomics_cluster_analysis/rds/ICGC.transc.TPM.df.rds")
# Select gene of interest (APOBEC3s and AID as positive control)
GOI <- c("AICDA", "APOBEC3A", "APOBEC3B", "APOBEC3C", "APOBEC3D", "APOBEC3F", "APOBEC3G", "APOBEC3H")
# Bin donors for high or low level expression levels for each GOI
APOBEC.df <- ICGC.transc.TPM.df %>% ungroup() %>% filter(gene.id %in% GOI) %>% group_by(gene.id) %>% filter(TPM >= 0.01) %>% mutate(TPM.bin = ntile(TPM, 3)) %>%
filter(TPM.bin != 2) %>% mutate(EXP = case_when(TPM.bin == 1 ~ "low", T ~ "high"))
saveRDS(APOBEC.df, "./002_Mut_signatures_Pan_cancer/rds/APOBEC.df.rds")
Plot
library(dplyr)
library(ggplot2)
setwd("/Users/pm23/Desktop/Projects/OriCan")
APOBEC.df <- readRDS("./002_Mut_signatures_Pan_cancer/rds/APOBEC.df.rds")
APOBEC.plot <- APOBEC.df %>%
ggplot(aes(x=gene.id, y=snvs.dens.ratio, fill = EXP)) +
geom_boxplot(alpha = 0.7) + ylab("Mutation density ratio (ori/flanks)") +
scale_fill_manual(values = c("#E7A208", "#D4D2D2")) +
geom_hline(yintercept=1, linetype="dashed") +
theme_bw() + theme(aspect.ratio=1, panel.grid.minor = element_line(linetype = "dotted"), axis.title.x=element_blank(), axis.text.x=element_text(angle=45,hjust=1))
APOBEC.plot

# Compute stats
ks.test(APOBEC.df[which(APOBEC.df$gene.id == "AICDA" & APOBEC.df$EXP == "high"),]$snvs.dens.ratio, APOBEC.df[which(APOBEC.df$gene.id == "AICDA" & APOBEC.df$EXP == "low"),]$snvs.dens.ratio, alternative = "less")
##
## Asymptotic two-sample Kolmogorov-Smirnov test
##
## data: APOBEC.df[which(APOBEC.df$gene.id == "AICDA" & APOBEC.df$EXP == "high"), ]$snvs.dens.ratio and APOBEC.df[which(APOBEC.df$gene.id == "AICDA" & APOBEC.df$EXP == "low"), ]$snvs.dens.ratio
## D^- = 0.5, p-value < 2.2e-16
## alternative hypothesis: the CDF of x lies below that of y
# p-value < 2.2e-16
ks.test(APOBEC.df[which(APOBEC.df$gene.id == "APOBEC3A" & APOBEC.df$EXP == "high"),]$snvs.dens.ratio, APOBEC.df[which(APOBEC.df$gene.id == "APOBEC3A" & APOBEC.df$EXP == "low"),]$snvs.dens.ratio, alternative = "less")
##
## Asymptotic two-sample Kolmogorov-Smirnov test
##
## data: APOBEC.df[which(APOBEC.df$gene.id == "APOBEC3A" & APOBEC.df$EXP == "high"), ]$snvs.dens.ratio and APOBEC.df[which(APOBEC.df$gene.id == "APOBEC3A" & APOBEC.df$EXP == "low"), ]$snvs.dens.ratio
## D^- = 0.095941, p-value = 0.0952
## alternative hypothesis: the CDF of x lies below that of y
# p-value = 0.0952
ks.test(APOBEC.df[which(APOBEC.df$gene.id == "APOBEC3B" & APOBEC.df$EXP == "high"),]$snvs.dens.ratio, APOBEC.df[which(APOBEC.df$gene.id == "APOBEC3B" & APOBEC.df$EXP == "low"),]$snvs.dens.ratio, alternative = "less")
##
## Asymptotic two-sample Kolmogorov-Smirnov test
##
## data: APOBEC.df[which(APOBEC.df$gene.id == "APOBEC3B" & APOBEC.df$EXP == "high"), ]$snvs.dens.ratio and APOBEC.df[which(APOBEC.df$gene.id == "APOBEC3B" & APOBEC.df$EXP == "low"), ]$snvs.dens.ratio
## D^- = 0.3276, p-value = 4.696e-13
## alternative hypothesis: the CDF of x lies below that of y
# p-value = 4.697e-13
ks.test(APOBEC.df[which(APOBEC.df$gene.id == "APOBEC3C" & APOBEC.df$EXP == "high"),]$snvs.dens.ratio, APOBEC.df[which(APOBEC.df$gene.id == "APOBEC3C" & APOBEC.df$EXP == "low"),]$snvs.dens.ratio, alternative = "less")
##
## Asymptotic two-sample Kolmogorov-Smirnov test
##
## data: APOBEC.df[which(APOBEC.df$gene.id == "APOBEC3C" & APOBEC.df$EXP == "high"), ]$snvs.dens.ratio and APOBEC.df[which(APOBEC.df$gene.id == "APOBEC3C" & APOBEC.df$EXP == "low"), ]$snvs.dens.ratio
## D^- = 0.46067, p-value < 2.2e-16
## alternative hypothesis: the CDF of x lies below that of y
# p-value < 2.2e-16
ks.test(APOBEC.df[which(APOBEC.df$gene.id == "APOBEC3D" & APOBEC.df$EXP == "high"),]$snvs.dens.ratio, APOBEC.df[which(APOBEC.df$gene.id == "APOBEC3D" & APOBEC.df$EXP == "low"),]$snvs.dens.ratio, alternative = "less")
##
## Asymptotic two-sample Kolmogorov-Smirnov test
##
## data: APOBEC.df[which(APOBEC.df$gene.id == "APOBEC3D" & APOBEC.df$EXP == "high"), ]$snvs.dens.ratio and APOBEC.df[which(APOBEC.df$gene.id == "APOBEC3D" & APOBEC.df$EXP == "low"), ]$snvs.dens.ratio
## D^- = 0.35793, p-value = 8.882e-16
## alternative hypothesis: the CDF of x lies below that of y
# p-value = 8.346e-16
ks.test(APOBEC.df[which(APOBEC.df$gene.id == "APOBEC3F" & APOBEC.df$EXP == "high"),]$snvs.dens.ratio, APOBEC.df[which(APOBEC.df$gene.id == "APOBEC3F" & APOBEC.df$EXP == "low"),]$snvs.dens.ratio, alternative = "less")
##
## Asymptotic two-sample Kolmogorov-Smirnov test
##
## data: APOBEC.df[which(APOBEC.df$gene.id == "APOBEC3F" & APOBEC.df$EXP == "high"), ]$snvs.dens.ratio and APOBEC.df[which(APOBEC.df$gene.id == "APOBEC3F" & APOBEC.df$EXP == "low"), ]$snvs.dens.ratio
## D^- = 0.23985, p-value = 1.695e-07
## alternative hypothesis: the CDF of x lies below that of y
# p-value = 1.695e-07
ks.test(APOBEC.df[which(APOBEC.df$gene.id == "APOBEC3G" & APOBEC.df$EXP == "high"),]$snvs.dens.ratio, APOBEC.df[which(APOBEC.df$gene.id == "APOBEC3G" & APOBEC.df$EXP == "low"),]$snvs.dens.ratio, alternative = "less")
##
## Asymptotic two-sample Kolmogorov-Smirnov test
##
## data: APOBEC.df[which(APOBEC.df$gene.id == "APOBEC3G" & APOBEC.df$EXP == "high"), ]$snvs.dens.ratio and APOBEC.df[which(APOBEC.df$gene.id == "APOBEC3G" & APOBEC.df$EXP == "low"), ]$snvs.dens.ratio
## D^- = 0.38282, p-value < 2.2e-16
## alternative hypothesis: the CDF of x lies below that of y
# p-value < 2.2e-16
ks.test(APOBEC.df[which(APOBEC.df$gene.id == "APOBEC3H" & APOBEC.df$EXP == "high"),]$snvs.dens.ratio, APOBEC.df[which(APOBEC.df$gene.id == "APOBEC3H" & APOBEC.df$EXP == "low"),]$snvs.dens.ratio, alternative = "less")
##
## Asymptotic two-sample Kolmogorov-Smirnov test
##
## data: APOBEC.df[which(APOBEC.df$gene.id == "APOBEC3H" & APOBEC.df$EXP == "high"), ]$snvs.dens.ratio and APOBEC.df[which(APOBEC.df$gene.id == "APOBEC3H" & APOBEC.df$EXP == "low"), ]$snvs.dens.ratio
## D^- = 0.090022, p-value = 0.1241
## alternative hypothesis: the CDF of x lies below that of y
# p-value = 0.1241
# Save plot
pdf("./Rplots/APOBEC.plot.pdf", width=7, height=6, useDingbats=FALSE)
APOBEC.plot
dev.off()
## quartz_off_screen
## 2
This analysis suggests that APOBEC3s may contribute to mutagenesis at
origins.
We now assess signs of APOBEC3s activity at origins through
mutational signature analysis.
# R
# Identify APOBEC signatures at origins in MALY genomes
# Load mutational differential matrix
snvs.diff.mut.mat <- readRDS("./002_Mut_signatures_Pan_cancer/rds/snvs.diff.mut.mat.rds")
# Recover MALY donors and associated matrices
MALY.donor.id <- MALY.snvs.dens.ratio.df %>% filter(donor %in% colnames(snvs.diff.mut.mat))
MALY.mut.mat <- snvs.diff.mut.mat %>% as.data.frame() %>% dplyr::select(all_of(MALY.donor.id$donor)) # 33 genomes
# Recover known signatures (COSMIC signatures)
COSMIC.signatures <- get_known_signatures()
# Select signatures relevant to malignant lymphomas
MALY.select <- c("SBS1", "SBS2", "SBS3", "SBS5", "SBS6", "SBS9", "SBS10a", "SBS10b", "SBS13", "SBS14", "SBS15", "SBS18", "SBS20", "SBS21", "SBS26", "SBS30", "SBS36", "SBS44", "SBS84", "SBS85")
MALY.signatures <- COSMIC.signatures %>% as.data.frame() %>% dplyr::select(all_of(MALY.select)) %>% as.matrix()
# Fit COSMIC signatures to MALY genomes
MALY.fit.res <- fit_to_signatures_strict(MALY.mut.mat, MALY.signatures, max_delta = 0.004)
# Merge signatures that are non-related to deaminases
DEAMIN.signatures <- COSMIC.signatures %>% as.data.frame() %>% dplyr::select("SBS2", "SBS13", "SBS84", "SBS85") %>% as.matrix()
non.DEAMIN.signatures <- COSMIC.signatures %>% as.data.frame() %>% dplyr::select(-colnames(DEAMIN.signatures)) %>% as.matrix()
DEAMIN.fit.res <- MALY.fit.res$fit_res$contribution %>% as.data.frame() %>% mutate_all(~ ./sum(.))
DEAMIN.contr.df <- DEAMIN.fit.res[which(row.names(DEAMIN.fit.res) %in% colnames(DEAMIN.signatures)),]
no.DEAMIN.contr.df <- colSums(DEAMIN.fit.res[which(row.names(DEAMIN.fit.res) %in% colnames(non.DEAMIN.signatures)),])
DEAMIN.contribution <- rbind(DEAMIN.contr.df, no.DEAMIN.contr.df) %>% as.matrix()
row.names(DEAMIN.contribution) <- c("SBS2", "SBS13", "SBS84", "SBS85", "others")
# Melt df for plotting
DEAMIN.contribution.melt.df <- DEAMIN.contribution %>% as.data.frame() %>% rownames_to_column("SBS") %>% gather("donor", "value", -SBS)
saveRDS(DEAMIN.contribution.melt.df, "./002_Mut_signatures_Pan_cancer/rds/DEAMIN.contribution.melt.df.rds")
Plot contribution
library(dplyr)
library(ggplot2)
setwd("/Users/pm23/Desktop/Projects/OriCan")
DEAMIN.contribution.melt.df <- readRDS("./002_Mut_signatures_Pan_cancer/rds/DEAMIN.contribution.melt.df.rds")
plot.order <- DEAMIN.contribution.melt.df %>% filter(SBS == "others") %>% arrange(-dplyr::desc(value)) %>% mutate(donor = factor(donor))
DEAMIN.contribution.melt.plot <- DEAMIN.contribution.melt.df %>% mutate(donor = factor(donor, levels = plot.order$donor, ordered = TRUE)) %>%
ggplot(aes(fill=SBS, y=value, x=donor)) +
geom_bar(position="stack", stat="identity") + ylab("Relative contribution") +
scale_fill_manual(values = c("#D4D2D2", "#33BAED", "#4F4A75", "#E6A004", "#E41F1A")) +
theme_bw() + theme(aspect.ratio=0.7, axis.title.x=element_blank(), panel.grid.major = element_blank(), axis.text.x=element_text(angle=45,hjust=1))
DEAMIN.contribution.melt.plot

# Save plot
pdf("./Rplots/DEAMIN.contribution.melt.plot.pdf", width=7, height=6, useDingbats=FALSE)
DEAMIN.contribution.melt.plot
dev.off()
## quartz_off_screen
## 2